<td>
   <a target="_blank" href="https://labelbox.com" ><img src="https://labelbox.com/blog/content/images/2021/02/logo-v4.svg" width=256/></a>
</td>

<td>
<a href="https://colab.research.google.com/github/Labelbox/labelbox-python/blob/master/examples/annotation_import/pdf.ipynb" target="_blank"><img
src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>
</td>

<td>
<a href="https://github.com/Labelbox/labelbox-python/tree/master/examples/annotation_import/pdf.ipynb" target="_blank"><img
src="https://img.shields.io/badge/GitHub-100000?logo=github&logoColor=white" alt="GitHub"></a>
</td>

# PDF Annotation Import


Supported annotations for PDF assets 

*Annotation types*
- Checklist classification 
- Radio classifications 
- Free text classifications
- Entities


*NDJson*
- Checklist classification (including nested classifications)
- Radio classifications (including nested classifications)
- Free text classifications
- Bounding box 
- Entities 

### Setup

In [None]:
!pip install -q 'labelbox[data]'

In [None]:
from typing import cast
import uuid
from uuid import uuid4

import labelbox as lb
import labelbox.types as lb_types
from labelbox.schema.queue_mode import QueueMode

### Replace with your API key
See the guide on creating an API key: https://docs.labelbox.com/docs/create-an-api-key

In [None]:
# Paste your Labelbox API key between the quotes before running this cell.
API_KEY = ""
# Client object used by the dataset, ontology, project and upload cells below.
client = lb.Client(API_KEY)

### Supported Annotations

In [None]:
########## Entity ##########

# Annotation Types
# Placeholder entity with empty token/group ids; Step 5 builds the populated
# equivalent from the document's text layer.
entities_annotations = lb_types.ObjectAnnotation(
    name="named_entity",
    value=lb_types.DocumentEntity(
        name="named_entity",
        textSelections=[
            lb_types.DocumentTextSelection(token_ids=[], group_id="", page=1),
        ],
    ),
)

# NDJson
# The "<UUID>" values are placeholders; Step 5 overwrites "textSelections"
# with ids read from the text layer.
entities_annotations_ndjson = {
    "name": "named_entity",
    "textSelections": [
        {
            "tokenIds": ["<UUID>"],
            "groupId": "<UUID>",
            "page": 1,
        },
    ],
}

In [None]:
########### Radio Classification #########

# Annotation types: a single selected answer for the "radio_question" feature.
radio_annotation = lb_types.ClassificationAnnotation(
    name="radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(name="first_radio_answer"),
    ),
)

# NDJSON: same question/answer pair expressed as a plain dict.
radio_annotation_ndjson = {
    "name": "radio_question",
    "answer": {"name": "first_radio_answer"},
}

In [None]:
############ Checklist Classification ###########

# Annotation types: both checklist options are selected.
checklist_annotation = lb_types.ClassificationAnnotation(
    name="checklist_question",
    value=lb_types.Checklist(
        answer=[
            lb_types.ClassificationAnswer(name="first_checklist_answer"),
            lb_types.ClassificationAnswer(name="second_checklist_answer"),
        ],
    ),
)

# NDJSON: the answer is a list with one entry per selected option.
checklist_annotation_ndjson = {
    "name": "checklist_question",
    "answer": [
        {"name": "first_checklist_answer"},
        {"name": "second_checklist_answer"},
    ],
}

In [None]:
############ Bounding Box ###########
# NDJSON (this cell defines no annotation-type equivalent for bounding boxes)

# NOTE(review): "page" is 0 here while the entity text selections use page 1 —
# confirm the page indexing each annotation kind expects.
bbox_annotation_ndjson = {
    "name": "bounding_box",
    "bbox": {
        "top": 42.799,
        "left": 86.498,
        "height": 141.911,
        "width": 303.195,
    },
    "page": 0,
    "unit": "POINTS",
}

In [None]:
# ############ nested classifications ###########

# Checklist whose selected answer carries a nested sub-question.
nested_checklist_annotation_ndjson = {
    "name": "nested_checklist_question",
    "answer": [
        {
            "name": "first_checklist_answer",
            "classifications": [
                {
                    "name": "sub_checklist_question",
                    "answer": {"name": "first_sub_checklist_answer"},
                },
            ],
        },
    ],
}

# Radio whose selected answer carries a nested sub-question.
nested_radio_annotation_ndjson = {
    "name": "nested_radio_question",
    "answer": {
        "name": "first_radio_answer",
        "classifications": [
            {
                "name": "sub_radio_question",
                "answer": {"name": "first_sub_radio_answer"},
            },
        ],
    },
}



In [None]:
############## Classification Free-form text ##############

# Annotation types
text_annotation = lb_types.ClassificationAnnotation(
    name="free_text",  # must match your ontology feature's name
    value=lb_types.Text(answer="sample text"),
)

# NDJSON: the answer is the raw string.
text_annotation_ndjson = {
    "name": "free_text",
    "answer": "sample text",
}

## Upload Annotations - putting it all together 

### Step 1: Import data rows into Catalog 

In [None]:
## Text layer url is required for uploading entity annotations
# Data row payload: the PDF, its text layer, and a freshly generated global key.
img_url = {
    "row_data": {
        "pdf_url": "https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483.pdf",
        "text_layer_url": "https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483-lb-textlayer.json",
    },
    "global_key": str(uuid.uuid4()),
}

# Create the dataset, add the single PDF data row to it, and show the result.
dataset = client.create_dataset(name="pdf_demo_dataset")
data_row = dataset.create_data_row(img_url)
print(data_row)

<DataRow {
    "created_at": "2023-03-15 13:36:25+00:00",
    "external_id": null,
    "global_key": "280c1bfa-e1c3-463a-8d10-3585dc013026",
    "media_attributes": {},
    "metadata": [],
    "metadata_fields": [],
    "row_data": "https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483.pdf",
    "uid": "clf9q77v80b5v07y4e3kdevdz",
    "updated_at": "2023-03-15 13:36:25+00:00"
}>


### Step 2: Create/select an Ontology for your project
Your project should have the correct ontology setup with all the tools and classifications supported for your annotations, and the tool names and classification instructions should match the name/instructions fields in your annotations to ensure the correct feature schemas are matched.

In [None]:
## Setup the ontology and link the tools created above.

# Feature names and option values below mirror the names used in the
# annotation payloads defined in the "Supported Annotations" cells.
ontology_builder = lb.OntologyBuilder(
    classifications=[  # List of Classification objects
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="radio_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(value="first_radio_answer"),
                lb.Option(value="second_radio_answer"),
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="checklist_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(value="first_checklist_answer"),
                lb.Option(value="second_checklist_answer"),
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.TEXT,
            name="free_text",
            scope=lb.Classification.Scope.GLOBAL,
        ),
        # Radio question whose first answer opens a nested radio sub-question.
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="nested_radio_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(
                    value="first_radio_answer",
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.RADIO,
                            name="sub_radio_question",
                            options=[lb.Option(value="first_sub_radio_answer")],
                        ),
                    ],
                ),
            ],
        ),
        # Checklist question whose first answer opens a nested checklist sub-question.
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="nested_checklist_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(
                    value="first_checklist_answer",
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.CHECKLIST,
                            name="sub_checklist_question",
                            options=[lb.Option(value="first_sub_checklist_answer")],
                        ),
                    ],
                ),
            ],
        ),
    ],
    tools=[  # List of Tool objects
        lb.Tool(tool=lb.Tool.Type.BBOX, name="bounding_box"),
        lb.Tool(tool=lb.Tool.Type.NER, name="named_entity"),
    ],
)

# Register the ontology for document (PDF) media.
ontology = client.create_ontology(
    "Document Annotation Import Demo",
    ontology_builder.asdict(),
    media_type=lb.MediaType.Document,
)

### Step 3: Creating a labeling project

In [None]:
# Create a Labelbox project for document media that receives data in batches
project = client.create_project(
    name="PDF_annotation_demo",
    queue_mode=QueueMode.Batch,
    media_type=lb.MediaType.Document,
)
# Configure the project's editor with the ontology created in Step 2.
project.setup_editor(ontology)

### Step 4: Send a batch of data rows to the project

In [None]:
# Queue every data row of the demo dataset in the project as a single batch.
project.create_batch(
    "PDF_annotation_batch",  # Each batch in a project must have a unique name
    dataset.export_data_rows(),  # A list of data rows or data row ids
    priority=5,  # priority between 1(Highest) - 5(lowest)
)

<Batch ID: 6b318090-c336-11ed-ba0b-0b34f53f6067>

### Step 5. Create the annotation payload
Create the annotations payload using the code snippets from the "Supported Annotations" section above.

Labelbox supports NDJSON only for this data type.

The resulting label_ndjson should have exactly the same content for annotations that are supported by both (with exception of the uuid strings that are generated)

In [None]:
## We need to construct our entity annotation using our text layer.

import json

import requests


## To learn how to generate a text layer for your documents please refer to the following repositories/files:
# https://github.com/Labelbox/PDF-OCR-Transform-CLI/blob/main/src/scripts/gcloud/gcp-vision-to-lb-text-layer.py
# https://github.com/Labelbox/PDF-OCR-Transform-CLI/blob/main/src/scripts/adobe/adobe-ocr-to-lb-text-layer.py

text_layer = "https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483-lb-textlayer.json"

## Fetch the content of the text layer.
## The timeout keeps the cell from hanging indefinitely, and raise_for_status
## stops an HTTP error body from being handed to the JSON parser below.
res = requests.get(text_layer, timeout=30)
res.raise_for_status()

## Parse the text layer
text_selections = []
for obj in json.loads(res.text):
    for group in obj['groups']:
        ## Find the text group that we are interested in annotating
        if group['content'] == "Metal-insulator (MI) transitions have been one of the":
            ## We now need all the tokens associated with each word in this text group
            list_tokens = [x['id'] for x in group['tokens']]
            ## build text selections for Annotation Types
            document_text_selection = lb_types.DocumentTextSelection(
                groupId=group['id'], tokenIds=list_tokens, page=1)
            text_selections.append(document_text_selection)

            ## build text selection for the NDJson annotation
            ## (replaces the "<UUID>" placeholders defined earlier)
            entities_annotations_ndjson.update(
                {
                    "textSelections": [
                        {
                            "groupId": group['id'],  # id associated with the group of words
                            "tokenIds": list_tokens,  # id associated with each word in a sentence group
                            "page": 1,
                        }
                    ]
                }
            )

## Without a match the NDJSON would still hold its "<UUID>" placeholders and
## the entity would have no selections, so stop here with a clear message.
if not text_selections:
    raise ValueError(
        "No text group in the text layer matched the target content; "
        "cannot build the entity annotation.")

# build your entity annotation with annotation types
entities_annotation_document_entity = lb_types.DocumentEntity(
    name="named_entity",
    textSelections=text_selections)
entities_annotation = lb_types.ObjectAnnotation(
    name="named_entity",
    value=entities_annotation_document_entity)

print(f"entities_annotations_ndjson={entities_annotations_ndjson}")
print(f"entities_annotation={entities_annotation}")
  

entities_annotations_ndjson={'name': 'named_entity', 'textSelections': [{'groupId': '2f4336f4-a07e-4e0a-a9e1-5629b03b719b', 'tokenIds': ['3f984bf3-1d61-44f5-b59a-9658a2e3440f', '3bf00b56-ff12-4e52-8cc1-08dbddb3c3b8', '6e1c3420-d4b7-4c5a-8fd6-ead43bf73d80', '87a43d32-af76-4a1d-b262-5c5f4d5ace3a', 'e8606e8a-dfd9-4c49-a635-ad5c879c75d0', '67c7c19e-4654-425d-bf17-2adb8cf02c30', '149c5e80-3e07-49a7-ab2d-29ddfe6a38fa', 'b0e94071-2187-461e-8e76-96c58738a52c'], 'page': 1}]}
entities_annotation=confidence=None name='named_entity' feature_schema_id=None extra={} value=DocumentEntity(name='named_entity', text_selections=[DocumentTextSelection(token_ids=['3f984bf3-1d61-44f5-b59a-9658a2e3440f', '3bf00b56-ff12-4e52-8cc1-08dbddb3c3b8', '6e1c3420-d4b7-4c5a-8fd6-ead43bf73d80', '87a43d32-af76-4a1d-b262-5c5f4d5ace3a', 'e8606e8a-dfd9-4c49-a635-ad5c879c75d0', '67c7c19e-4654-425d-bf17-2adb8cf02c30', '149c5e80-3e07-49a7-ab2d-29ddfe6a38fa', 'b0e94071-2187-461e-8e76-96c58738a52c'], group_id='2f4336f4-a07e-4e0a

#### Python annotation
Here we create the complete labels ndjson payload of annotations only using python annotation format. There is one annotation for each reference to an annotation that we created. Note that only a handful of python annotation types are supported for PDF documents.

In [None]:
# create a Label

# NOTE(review): the data rows are PDFs but are wrapped in TextData here —
# confirm whether a document-specific data type should be used instead.
labels = []
# A plain for-loop is used on purpose: `data_row` stays bound after the loop
# and is read again when the NDJSON payload is assembled.
for data_row in dataset.export_data_rows():
    label = lb_types.Label(
        data=lb_types.TextData(uid=data_row.uid),
        annotations=[
            entities_annotation,
            checklist_annotation,
            text_annotation,
            radio_annotation,
        ],
    )
    labels.append(label)

#### NDJson annotations
Here we create the complete labels ndjson payload of annotations only using NDJSON format. There is one annotation for each reference to an annotation that we created above.

In [None]:

# Attach the target data row id to every NDJSON annotation (the dicts are
# updated in place) and collect them into one payload list.
ndjson_annotation = []
for annot in (
    entities_annotations_ndjson,
    bbox_annotation_ndjson,
    text_annotation_ndjson,
    checklist_annotation_ndjson,
    nested_checklist_annotation_ndjson,
    nested_radio_annotation_ndjson,
    radio_annotation_ndjson,
):
    annot['dataRow'] = {'id': data_row.uid}
    ndjson_annotation.append(annot)



### Step 6: Import the annotation payload

Option A: Upload to a labeling project as pre-labels (MAL)

In [None]:
# Upload the NDJSON payload as pre-labels; the random suffix gives each run
# its own import name.
upload_job = lb.MALPredictionImport.create_from_objects(
    client=client,
    project_id=project.uid,
    name=f"pdf_annotation_upload{uuid.uuid4()}",
    predictions=ndjson_annotation,
)

upload_job.wait_until_done()
# Errors will appear for annotation uploads that failed.
print("Errors:", upload_job.errors)

Errors: []


Option B: Upload to a labeling project using ground truth

In [None]:

# Upload the NDJSON payload as ground-truth labels; the random suffix gives
# each run its own import name.
upload_job = lb.LabelImport.create_from_objects(
    client = client,
    project_id = project.uid,
    name="label_import_job"+str(uuid.uuid4()),
    labels=ndjson_annotation)

# Wait for the import to finish before reading its errors, as Option A does.
upload_job.wait_until_done()
# Errors will appear for annotation uploads that failed.
print("Errors:", upload_job.errors)

Errors: []
