From b5a0a325878b5374583d680520f3b48a69951f6d Mon Sep 17 00:00:00 2001 From: Andrea Ovalle Date: Thu, 9 Mar 2023 10:07:10 -0500 Subject: [PATCH 1/2] New pdf notebook --- examples/annotation_import/pdf.ipynb | 535 +++++++++++++++++++++++---- 1 file changed, 458 insertions(+), 77 deletions(-) diff --git a/examples/annotation_import/pdf.ipynb b/examples/annotation_import/pdf.ipynb index 0f9f2e037..e96feaa59 100644 --- a/examples/annotation_import/pdf.ipynb +++ b/examples/annotation_import/pdf.ipynb @@ -37,26 +37,42 @@ { "metadata": {}, "source": [ - "* Notes:\n", - " * Wait until the import job is complete before opening the Editor to make sure all annotations are imported properly." + "#### PDF Prediction Import\n", + "Supported annotations for PDF assets with text layers\n", + "\n", + "#### Entity annotations\n", + "Supported annotations for PDF assets without text layers\n", + "\n", + "*Annotation types*\n", + "- Checklist classification (including nested classifications)\n", + "- Radio classifications (including nested classifications)\n", + "- Free text classifications\n", + "\n", + "\n", + "*NDJson*\n", + "- Checklist classification (including nested classifications)\n", + "- Radio classifications (including nested classifications)\n", + "- Free text classifications\n", + "- Bounding box \n", + "- Entities " ], "cell_type": "markdown" }, { "metadata": {}, "source": [ - "!pip install -q 'labelbox[data]'" + "### Setup" ], - "cell_type": "code", - "outputs": [], - "execution_count": null + "cell_type": "markdown" }, { "metadata": {}, "source": [ - "# Imports" + "!pip install -q 'labelbox[data]'" ], - "cell_type": "markdown" + "cell_type": "code", + "outputs": [], + "execution_count": null }, { "metadata": {}, @@ -65,7 +81,10 @@ "import labelbox.types as lb_types\n", "import uuid\n", "from uuid import uuid4\n", - "import json" + "import json\n", + "import uuid\n", + "import numpy as np\n", + "from labelbox.schema.queue_mode import QueueMode" ], "cell_type": "code", 
"outputs": [], @@ -74,8 +93,8 @@ { "metadata": {}, "source": [ - "# API Key and Client\n", - "Provide a valid api key below in order to properly connect to the Labelbox Client." + "### Replace with your API key\n", + "Guides on https://docs.labelbox.com/docs/create-an-api-key" ], "cell_type": "markdown" }, @@ -93,42 +112,108 @@ { "metadata": {}, "source": [ - "---- \n", - "### Steps\n", - "1. Make sure project is setup\n", - "2. Collect annotations\n", - "3. Upload" + "### Supported Annotations" ], "cell_type": "markdown" }, { "metadata": {}, "source": [ - "### Step 1: Project setup" + "########## Entity ##########\n", + "# NDJSON\n", + "entities_annotations_ndjson = { \n", + " \"name\": \"named_entity\",\n", + " \"textSelections\": [\n", + " {\n", + " \"tokenIds\": [\n", + " \"\",\n", + " ],\n", + " \"groupId\": \"\",\n", + " \"page\": 1,\n", + " }\n", + " ]\n", + "}" ], - "cell_type": "markdown" + "cell_type": "code", + "outputs": [], + "execution_count": null }, { "metadata": {}, "source": [ - "First, we create an ontology with all the possible tools and classifications supported for PDF. 
The official list of supported annotations to import can be found here:\n", - "- [Model-Assisted Labeling](https://docs.labelbox.com/docs/model-assisted-labeling) (annotations/labels are not submitted)\n", - "- [PDF Annotations](https://docs.labelbox.com/docs/document-annotations)" + "########### Radio Classification #########\n", + "\n", + "# Annotation types \n", + "radio_annotation = lb_types.ClassificationAnnotation(\n", + " name=\"radio_question\",\n", + " value=lb_types.Radio(answer = \n", + " lb_types.ClassificationAnswer(name = \"first_radio_answer\")\n", + " )\n", + ")\n", + "# NDJSON\n", + "radio_annotation_ndjson = {\n", + " 'name': 'radio_question',\n", + " 'answer': {'name': 'first_radio_answer'}\n", + "}" ], - "cell_type": "markdown" + "cell_type": "code", + "outputs": [], + "execution_count": null }, { "metadata": {}, "source": [ - "ontology_builder = lb.OntologyBuilder(\n", - " tools=[ \n", - " lb.Tool( # Entity tool given the name \"NER\"\n", - " tool=lb.Tool.Type.NER, \n", - " name= \"NER\"),\n", - " lb.Tool( # Relationship tool given the name \"relationship\"\n", - " tool=lb.Tool.Type.RELATIONSHIP, \n", - " name=\"relationship\")]\n", - " )" + "############ Checklist Classification ###########\n", + "\n", + "# Annotation types \n", + "checklist_annotation = lb_types.ClassificationAnnotation(\n", + " name=\"checklist_question\",\n", + " value=lb_types.Checklist(answer = [\n", + " lb_types.ClassificationAnswer(name = \"first_checklist_answer\"),\n", + " lb_types.ClassificationAnswer(name = \"second_checklist_answer\")\n", + " ])\n", + " )\n", + "\n", + "\n", + "# NDJSON\n", + "checklist_annotation_ndjson = {\n", + " 'name': 'checklist_question',\n", + " 'answer': [\n", + " {'name': 'first_checklist_answer'},\n", + " {'name': 'second_checklist_answer'}\n", + " ]\n", + "}" + ], + "cell_type": "code", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "source": [ + "############ Bounding Box ###########\n", + "# Python Annotation 
\n", + "bbox_annotation = lb_types.ObjectAnnotation(\n", + " name = \"bounding_box\", # must match your ontology feature's name\n", + " value = lb_types.Rectangle(\n", + " start=lb_types.Point(x=42.799, y=86.498), # Top left\n", + " end=lb_types.Point(x=141.911, y=303.195), # Bottom right\n", + " ),\n", + " page = 1,\n", + " unit=\"POINTS\"\n", + ")\n", + "\n", + "bbox_annotation_ndjson = {\n", + " 'name': 'bounding_box',\n", + " 'bbox': {\n", + " \"top\": 86.498, # y of the top-left point\n", + " \"left\": 42.799, # x of the top-left point\n", + " \"height\": 216.697, # bottom-right y - top-left y\n", + " \"width\": 99.112 # bottom-right x - top-left x\n", + " },\n", + " 'page': 0,\n", + " 'unit': \"POINTS\"\n", + "}" ], "cell_type": "code", "outputs": [], @@ -137,7 +222,32 @@ { "metadata": {}, "source": [ - "ontology = client.create_ontology(\"pdf-entity-import-ontology\", ontology_builder.asdict())" + "# ############ nested classifications ###########\n", + "\n", + "nested_checklist_annotation_ndjson = {\n", + " \"name\": \"nested_checklist_question\",\n", + " \"answer\": [{\n", + " \"name\": \"first_checklist_answer\", \n", + " \"classifications\" : [\n", + " {\n", + " \"name\": \"sub_checklist_question\", \n", + " \"answer\": {\"name\": \"first_sub_checklist_answer\"}\n", + " } \n", + " ] \n", + " }]\n", + "}\n", + "\n", + "nested_radio_annotation_ndjson = {\n", + " 'name': 'nested_radio_question',\n", + " 'answer': {\n", + " 'name': 'first_radio_answer',\n", + " 'classifications': [{\n", + " 'name':'sub_radio_question',\n", + " 'answer': { 'name' : 'first_sub_radio_answer'}\n", + " }]\n", + " }\n", + "}\n", + "\n" ], "cell_type": "code", "outputs": [], @@ -146,40 +256,73 @@ { "metadata": {}, "source": [ - "# Create Labelbox project\n", - "mal_project = client.create_project(name=\"pdf_entity_import\", media_type=lb.MediaType.Document)\n", + "############## Classification Free-form text ############## \n", + "\n", + "text_annotation = lb_types.ClassificationAnnotation(\n", + " name=\"free_text\", # must match your ontology feature's name\n", + " 
value=lb_types.Text(answer=\"sample text\")\n", + ")\n", "\n", - "# Create one Labelbox dataset\n", - "dataset = client.create_dataset(name=\"pdf_entity_import_dataset\")\n", "\n", - "# Grab an example pdf and create a Labelbox data row\n", - "asset = [\n", - " {\n", + "text_annotation_ndjson = {\n", + " 'name': 'free_text',\n", + " 'answer': 'sample text'\n", + "}" + ], + "cell_type": "code", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "source": [ + "## Upload Annotations - putting it all together " + ], + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": [ + "### Step 1: Import data rows into Catalog " + ], + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": [ + "## Text layer url is required for uploading entity annotations\n", + "img_url = {\n", " \"row_data\": {\n", " \"pdf_url\": \"https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483.pdf\",\n", " \"text_layer_url\": \"https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483-lb-textlayer.json\"\n", - " }\n", - " }\n", - "]\n", + " },\n", + " \"global_key\": str(uuid.uuid4())\n", + "}\n", + "\n", + "\n", + "dataset = client.create_dataset(name=\"pdf_demo_dataset\")\n", + "\n", + "data_row = dataset.create_data_row(img_url)\n", "\n", - "task = dataset.create_data_rows(asset)\n", - "task.wait_till_done()\n", - "print(task.errors)\n" + "print(data_row)" ], "cell_type": "code", "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:labelbox.client:Default createProject behavior will soon be adjusted to prefer batch projects. 
Pass in `queue_mode` parameter explicitly to opt-out for the time being.\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "None\n" + "\n" ] } ], @@ -188,12 +331,86 @@ { "metadata": {}, "source": [ + "### Step 2: Create/select an Ontology for your project\n", + "Your project should have the correct ontology setup with all the tools and classifications supported for your annotations, and the tool names and classification instructions should match the name/instructions fields in your annotations to ensure the correct feature schemas are matched." + ], + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": [ + "## Setup the ontology and link the tools created above.\n", + "\n", + "ontology_builder = lb.OntologyBuilder(\n", + " classifications=[ # List of Classification objects\n", + " lb.Classification( \n", + " class_type=lb.Classification.Type.RADIO,\n", + " name=\"radio_question\", \n", + " scope = lb.Classification.Scope.GLOBAL,\n", + " options=[\n", + " lb.Option(value=\"first_radio_answer\"),\n", + " lb.Option(value=\"second_radio_answer\")\n", + " ]\n", + " ),\n", + " lb.Classification(\n", + " class_type=lb.Classification.Type.CHECKLIST,\n", + " name=\"checklist_question\", \n", + " scope = lb.Classification.Scope.GLOBAL,\n", + " options=[\n", + " lb.Option(value=\"first_checklist_answer\"),\n", + " lb.Option(value=\"second_checklist_answer\")\n", + " ]\n", + " ), \n", + " lb.Classification(\n", + " class_type=lb.Classification.Type.TEXT,\n", + " name=\"free_text\",\n", + " scope = lb.Classification.Scope.GLOBAL\n", + " ),\n", + " lb.Classification(\n", + " class_type=lb.Classification.Type.RADIO,\n", + " name=\"nested_radio_question\",\n", + " scope = lb.Classification.Scope.GLOBAL,\n", + " options=[\n", + " lb.Option(\"first_radio_answer\",\n", + " options=[\n", + " lb.Classification(\n", + " class_type=lb.Classification.Type.RADIO,\n", + " name=\"sub_radio_question\",\n", + " options=[lb.Option(\"first_sub_radio_answer\")]\n", + 
" )\n", + " ]\n", + " )\n", + " ] \n", + " ),\n", + " lb.Classification(\n", + " class_type=lb.Classification.Type.CHECKLIST,\n", + " name=\"nested_checklist_question\",\n", + " scope = lb.Classification.Scope.GLOBAL,\n", + " options=[\n", + " lb.Option(\"first_checklist_answer\",\n", + " options=[\n", + " lb.Classification(\n", + " class_type=lb.Classification.Type.CHECKLIST,\n", + " name=\"sub_checklist_question\", \n", + " options=[lb.Option(\"first_sub_checklist_answer\")]\n", + " )\n", + " ]\n", + " )\n", + " ]\n", + " ), \n", + " ],\n", + " tools=[ # List of Tool objects\n", + " lb.Tool( \n", + " tool=lb.Tool.Type.BBOX,\n", + " name=\"bounding_box\"), \n", + " lb.Tool(\n", + " tool=lb.Tool.Type.NER, \n", + " name=\"named_entity\")]\n", + ")\n", "\n", - "data_row_id = next(dataset.data_rows())\n", - "# Connect your ontology and editor to your MAL project\n", - "mal_project.setup_editor(ontology) \n", - "# Connect your dataset to your MAL project\n", - "batch = mal_project.create_batch('test-batch_' + str(uuid4()), [data_row_id] , 5)" + "ontology = client.create_ontology(\"Document Annotation Import Demo\",\n", + " ontology_builder.asdict(),\n", + " media_type=lb.MediaType.Document)" ], "cell_type": "code", "outputs": [], @@ -202,26 +419,102 @@ { "metadata": {}, "source": [ - "### Step 2: create entity annotation" + "### Step 3: Creating a labeling project" ], "cell_type": "markdown" }, { "metadata": {}, "source": [ - "ANNOTATION = {\n", - " \"name\" : tool_name,\n", - " \"dataRow\": {\"id\": data_row_id.uid},\n", - " \"textSelections\": [\n", + "# Create a Labelbox project\n", + "project = client.create_project(name=\"PDF_annotation_demo\", \n", + " queue_mode=QueueMode.Batch,\n", + " media_type=lb.MediaType.Document)\n", + "project.setup_editor(ontology)" + ], + "cell_type": "code", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "source": [ + "### Step 4: Send a batch of data rows to the project" + ], + "cell_type": "markdown" + }, + 
{ + "metadata": {}, + "source": [ + "project.create_batch(\n", + " \"PDF_annotation_batch\", # Each batch in a project must have a unique name\n", + " dataset.export_data_rows(), # A list of data rows or data row ids\n", + " 5 # priority between 1(Highest) - 5(lowest)\n", + ")" + ], + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": null + }, + { + "metadata": {}, + "source": [ + "### Step 5: Create the annotation payload\n", + "Create the annotations payload using the code snippets from the Supported Annotations section above.\n", + "\n", + "Labelbox supports NDJSON only for this data type.\n", + "\n", + "The resulting label_ndjson should have exactly the same content for annotations that are supported by both formats (with the exception of the generated uuid strings)" + ], + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": [ + "## We need to construct our entity annotation using our text layer. 
\n", + "\n", + "import requests\n", + "import json\n", + "\n", + "## To learn how to generate a text layer for your documents please refer to the following repositories/files: \n", + "# https://github.com/Labelbox/PDF-OCR-Transform-CLI/blob/main/src/scripts/gcloud/gcp-vision-to-lb-text-layer.py\n", + "# https://github.com/Labelbox/PDF-OCR-Transform-CLI/blob/main/src/scripts/adobe/adobe-ocr-to-lb-text-layer.py\n", + "\n", + "text_layer = \"https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483-lb-textlayer.json\"\n", + "\n", + "## Fetch the content of the text layer\n", + "res = requests.get(text_layer) \n", + "\n", + "## Parse the text layer\n", + "for obj in json.loads(res.text):\n", + " for group in obj['groups']: \n", + " ## Find the text group that we are interested in annotating\n", + " if group['content'] == \"Metal-insulator (MI) transitions have been one of the\":\n", + " ## We now need all the tokens associated with each word in this text group\n", + " list_tokens = [x['id'] for x in group['tokens']]\n", + " entities_annotations_ndjson.update(\n", " {\n", - " \"tokenIds\": [\n", - " \"521f705e-b276-4ac7-8e5b-2e38e037f80f\", # superconductivity\n", - " ],\n", - " \"groupId\": \"ed53dd86-ef39-4634-9505-ee0eebedef44\",\n", - " \"page\": 1,\n", + " \"textSelections\": [\n", + " {\n", + " \"groupId\": group['id'], #id associated with the group of words\n", + " \"tokenIds\": list_tokens, #id associated with each word in a sentence group\n", + " \"page\": 1,\n", + " }\n", + " ]\n", " }\n", - " ],\n", - "}" + " )\n", + " " ], "cell_type": "code", "outputs": [], @@ -230,14 +523,28 @@ { "metadata": {}, "source": [ - "### Step 3 upload annotations" + "#### Python annotation\n", + "Here we create the complete label ndjson payload of annotations only using python annotation format. There is one annotation for each reference to an annotation that we created. 
Note that only a handful of python annotation types are supported for PDF documents." ], "cell_type": "markdown" }, { "metadata": {}, "source": [ - "task = lb.MALPredictionImport.create_from_objects(client, mal_project.uid, str(uuid.uuid4()), [ANNOTATION])" + "# create a Label\n", + "\n", + "label = []\n", + "for data_row in dataset.export_data_rows():\n", + " label.append(lb_types.Label(\n", + " data=lb_types.TextData(\n", + " uid=data_row.uid),\n", + " annotations = [\n", + " checklist_annotation, \n", + " text_annotation,\n", + " radio_annotation\n", + " ]\n", + " )\n", + ")" ], "cell_type": "code", "outputs": [], @@ -246,7 +553,30 @@ { "metadata": {}, "source": [ - "task.wait_until_done()" + "#### NDJson annotations\n", + "Here we create the complete label ndjson payload of annotations only using NDJSON format. There is one annotation for each reference to an annotation that we created above." + ], + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": [ + "\n", + "ndjson_annotation = []\n", + "for annot in [\n", + " entities_annotations_ndjson,\n", + " bbox_annotation_ndjson,\n", + " text_annotation_ndjson,\n", + " checklist_annotation_ndjson,\n", + " nested_checklist_annotation_ndjson,\n", + " nested_radio_annotation_ndjson,\n", + " radio_annotation_ndjson\n", + " ]:\n", + " annot.update({\n", + " 'dataRow': {'id': data_row.uid},\n", + " })\n", + " ndjson_annotation.append(annot)\n", + "\n" ], "cell_type": "code", "outputs": [], @@ -255,8 +585,60 @@ { "metadata": {}, "source": [ - "print(task.errors)\n", - "print(task.statuses)" + "### Step 6: Import the annotation payload" + ], + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": [ + "Option A: Upload to a labeling project as pre-labels (MAL)" + ], + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": [ + "upload_job = lb.MALPredictionImport.create_from_objects(\n", + " client = client,\n", + " project_id = project.uid,\n", + " name=\"pdf_annotation_upload\" + 
str(uuid.uuid4()),\n", + " predictions=ndjson_annotation)\n", + "\n", + "upload_job.wait_until_done()\n", + "# Errors will appear for annotation uploads that failed.\n", + "print(\"Errors:\", upload_job.errors)" + ], + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Errors: []\n" + ] + } + ], + "execution_count": null + }, + { + "metadata": {}, + "source": [ + "Option B: Upload to a labeling project using ground truth" + ], + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": [ + "\n", + "upload_job = lb.LabelImport.create_from_objects(\n", + " client = client, \n", + " project_id = project.uid, \n", + " name=\"label_import_job\"+str(uuid.uuid4()), \n", + " labels=ndjson_annotation)\n", + "\n", + "print(\"Errors:\", upload_job.errors)" ], "cell_type": "code", "outputs": [ @@ -264,8 +646,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[]\n", - "[{'uuid': 'dffe7d75-1c61-4dcd-a0d4-804438cf540b', 'dataRow': {'id': 'clb5cdf1c6u0a077d87kffy28'}, 'status': 'SUCCESS'}]\n" + "Errors: []\n" ] } ], From 9b7887274c9fc6a11b1e5d292cc57aa7e447b269 Mon Sep 17 00:00:00 2001 From: Andrea Ovalle <74880762+ovalle15@users.noreply.github.com> Date: Thu, 9 Mar 2023 10:22:04 -0500 Subject: [PATCH 2/2] Removed "Prediction" from text --- examples/annotation_import/pdf.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/annotation_import/pdf.ipynb b/examples/annotation_import/pdf.ipynb index e96feaa59..47264c02b 100644 --- a/examples/annotation_import/pdf.ipynb +++ b/examples/annotation_import/pdf.ipynb @@ -37,7 +37,7 @@ { "metadata": {}, "source": [ - "#### PDF Prediction Import\n", + "#### PDF Annotation Import\n", "Supported annotations for PDF assets with text layers\n", "\n", "#### Entity annotations\n", @@ -653,4 +653,4 @@ "execution_count": null } ] -} \ No newline at end of file +}