<td>
   <a target="_blank" href="https://labelbox.com" ><img src="https://labelbox.com/blog/content/images/2021/02/logo-v4.svg" width=256/></a>
</td>

<td>
<a href="https://colab.research.google.com/github/Labelbox/labelbox-python/blob/master/examples/annotation_import/pdf.ipynb" target="_blank"><img
src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>
</td>

<td>
<a href="https://github.com/Labelbox/labelbox-python/tree/master/examples/annotation_import/pdf.ipynb" target="_blank"><img
src="https://img.shields.io/badge/GitHub-100000?logo=github&logoColor=white" alt="GitHub"></a>
</td>

# PDF Annotation Import


Supported annotations for PDF assets 

*Annotation types*
- Checklist classification 
- Radio classifications 
- Free text classifications
- Entities


*NDJson*
- Checklist classification (including nested classifications)
- Radio classifications (including nested classifications)
- Free text classifications
- Bounding box 
- Entities 

### Setup

In [None]:
!pip install -q 'labelbox[data]'

In [None]:
import uuid
import labelbox as lb
import labelbox.types as lb_types
from labelbox.schema.queue_mode import QueueMode

### Replace with your API key
See the guide on creating an API key: https://docs.labelbox.com/docs/create-an-api-key

In [None]:
# Add your API key. Leave it empty to use the LABELBOX_API_KEY environment
# variable instead, which keeps the key out of the saved notebook.
API_KEY = ""
# An empty placeholder is passed as None so that the client falls back to the
# environment variable rather than authenticating with an empty key.
client = lb.Client(api_key=API_KEY or None)

### Supported Annotations

In [None]:
########## Entity ##########

# Annotation Types
# The token ids and group id are empty placeholders in this snippet.
entities_annotations = lb_types.ObjectAnnotation(
    name="named_entity",
    value=lb_types.DocumentEntity(
        name="named_entity",
        textSelections=[
            lb_types.DocumentTextSelection(token_ids=[], group_id="", page=1),
        ],
    ),
)

# NDJSON
# "<UUID>" marks the ids that have to be replaced with real token/group ids.
entities_annotations_ndjson = dict(
    name="named_entity",
    textSelections=[
        dict(tokenIds=["<UUID>"], groupId="<UUID>", page=1),
    ],
)

In [None]:
########### Radio Classification #########

# Annotation types
# A radio question carries exactly one selected answer.
radio_annotation = lb_types.ClassificationAnnotation(
    name="radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(name="first_radio_answer")),
)

# NDJSON
radio_annotation_ndjson = dict(
    name="radio_question",
    answer=dict(name="first_radio_answer"),
)

In [None]:
############ Checklist Classification ###########

# Annotation types
# A checklist question carries a list of selected answers.
checklist_annotation = lb_types.ClassificationAnnotation(
    name="checklist_question",
    value=lb_types.Checklist(answer=[
        lb_types.ClassificationAnswer(name=answer_name)
        for answer_name in ("first_checklist_answer", "second_checklist_answer")
    ]),
)

# NDJSON
checklist_annotation_ndjson = {
    "name": "checklist_question",
    "answer": [
        {"name": answer_name}
        for answer_name in ("first_checklist_answer", "second_checklist_answer")
    ],
}

In [None]:
############ Bounding Box ###########

# NDJSON
# The box is given by its top-left corner plus height and width; "unit"
# declares the measurement unit of those four numbers.
bbox_annotation_ndjson = dict(
    name="bounding_box",
    bbox=dict(top=42.799, left=86.498, height=141.911, width=303.195),
    page=0,
    unit="POINTS",
)

In [None]:
# ############ nested classifications ###########

# NDJSON
# Checklist answers are lists at every level: the top-level question and the
# sub-question nested under "first_checklist_answer" both take a list of
# answers, matching the CHECKLIST type they have in the ontology.
nested_checklist_annotation_ndjson = {
    "name": "nested_checklist_question",
    "answer": [{
        "name": "first_checklist_answer",
        "classifications": [
            {
                "name": "sub_checklist_question",
                "answer": [{"name": "first_sub_checklist_answer"}]
            }
        ]
    }]
}

# Radio answers are single objects; the sub-question is nested under the
# selected answer through its "classifications" list.
nested_radio_annotation_ndjson = {
    "name": "nested_radio_question",
    "answer": {
        "name": "first_radio_answer",
        "classifications": [{
            "name": "sub_radio_question",
            "answer": {"name": "first_sub_radio_answer"}
        }]
    }
}



In [None]:
############## Classification Free-form text ##############

# Annotation types
text_annotation = lb_types.ClassificationAnnotation(
    name="free_text",  # must match your ontology feature's name
    value=lb_types.Text(answer="sample text"),
)

# NDJSON
text_annotation_ndjson = dict(name="free_text", answer="sample text")

In [None]:
######### BBOX with nested classifications #########

# NDJSON
# The radio sub-classification rides along with the box in "classifications".
bbox_with_radio_subclass_annotation_ndjson = dict(
    name="bbox_with_radio_subclass",
    classifications=[
        dict(
            name="sub_radio_question",
            answer=dict(name="first_sub_radio_answer"),
        ),
    ],
    bbox=dict(top=214.894, left=189.215, height=264, width=240.573),
    page=1,
    unit="POINTS",
)

In [None]:
############ NER with nested classifications ########

# NDJSON
# "<UUID>" marks the ids that have to be replaced with real token/group ids.
ner_with_checklist_subclass_annotation_ndjson = dict(
    name="ner_with_checklist_subclass",
    classifications=[
        dict(
            name="sub_checklist_question",
            answer=[dict(name="first_sub_checklist_answer")],
        ),
    ],
    textSelections=[
        dict(tokenIds=["<UUID>"], groupId="<UUID>", page=1),
    ],
)
  


In [None]:
######### Relationships ##########

## Only supported for MAL imports
# The relationship refers to its two entities by the uuid each of them carries.
uuid_source = str(uuid.uuid4())
uuid_target = str(uuid.uuid4())

# "<UUID>" marks the ids that have to be replaced with real token/group ids.
entity_source = dict(
    name="named_entity",
    uuid=uuid_source,
    textSelections=[
        dict(tokenIds=["<UUID>"], groupId="<UUID>", page=1),
    ],
)

entity_target = dict(
    name="named_entity",
    uuid=uuid_target,
    textSelections=[
        dict(tokenIds=["<UUID>"], groupId="<UUID>", page=1),
    ],
)

ner_relationship_annotation_ndjson = dict(
    name="relationship",
    relationship=dict(
        source=uuid_source,
        target=uuid_target,
        type="bidirectional",
    ),
)




In [None]:
######### BBOX with relationships #############

## Only supported for MAL imports
# The relationship refers to its two boxes by the uuid each of them carries.
uuid_source_2 = str(uuid.uuid4())
uuid_target_2 = str(uuid.uuid4())

bbox_source = dict(
    name="bounding_box",
    uuid=uuid_source_2,
    bbox=dict(top=68.875, left=188.257, height=80.681, width=82.65),
    page=1,
    unit="POINTS",
)

bbox_target = dict(
    name="bounding_box",
    uuid=uuid_target_2,
    bbox=dict(top=66.251, left=96.424, height=80.681, width=82.65),
    page=1,
    unit="POINTS",
)

bbox_relationship_annotation_ndjson = dict(
    name="relationship",
    relationship=dict(
        source=uuid_source_2,
        target=uuid_target_2,
        type="bidirectional",
    ),
)

## Upload Annotations - putting it all together 

### Step 1: Import data rows into Catalog 

In [None]:
## Text layer url is required for uploading entity annotations
global_key = "0801.3483.pdf"

# One data row: the PDF, its text layer, and the global key used to refer to it.
img_url = dict(
    row_data=dict(
        pdf_url="https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483.pdf",
        text_layer_url="https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483-lb-textlayer.json",
    ),
    global_key=global_key,
)

# Create the dataset, submit the data row, and wait for the task to finish
# before reporting its outcome.
dataset = client.create_dataset(name="pdf_demo_dataset")
task = dataset.create_data_rows([img_url])
task.wait_till_done()
print("Errors:", task.errors)
print("Failed data rows:", task.failed_data_rows)

### Step 2: Create/select an Ontology for your project



In [None]:
## Setup the ontology and link the tools created above.

ontology_builder = lb.OntologyBuilder(
    # Global classifications: radio, checklist, free text, plus one nested
    # radio and one nested checklist question.
    classifications=[
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="radio_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(value="first_radio_answer"),
                lb.Option(value="second_radio_answer"),
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="checklist_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(value="first_checklist_answer"),
                lb.Option(value="second_checklist_answer"),
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.TEXT,
            name="free_text",
            scope=lb.Classification.Scope.GLOBAL,
        ),
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="nested_radio_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(
                    value="first_radio_answer",
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.RADIO,
                            name="sub_radio_question",
                            options=[lb.Option(value="first_sub_radio_answer")],
                        ),
                    ],
                ),
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="nested_checklist_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(
                    value="first_checklist_answer",
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.CHECKLIST,
                            name="sub_checklist_question",
                            options=[lb.Option(value="first_sub_checklist_answer")],
                        ),
                    ],
                ),
            ],
        ),
    ],
    # Tools: plain bbox / NER / relationship, then NER and bbox variants that
    # carry a sub-classification.
    tools=[
        lb.Tool(tool=lb.Tool.Type.BBOX, name="bounding_box"),
        lb.Tool(tool=lb.Tool.Type.NER, name="named_entity"),
        lb.Tool(tool=lb.Tool.Type.RELATIONSHIP, name="relationship"),
        lb.Tool(
            tool=lb.Tool.Type.NER,
            name="ner_with_checklist_subclass",
            classifications=[
                lb.Classification(
                    class_type=lb.Classification.Type.CHECKLIST,
                    name="sub_checklist_question",
                    options=[lb.Option(value="first_sub_checklist_answer")],
                ),
            ],
        ),
        lb.Tool(
            tool=lb.Tool.Type.BBOX,
            name="bbox_with_radio_subclass",
            classifications=[
                lb.Classification(
                    class_type=lb.Classification.Type.RADIO,
                    name="sub_radio_question",
                    options=[lb.Option(value="first_sub_radio_answer")],
                ),
            ],
        ),
    ],
)

# Register the ontology for document (PDF) media.
ontology = client.create_ontology(
    "Document Annotation Import Demo",
    ontology_builder.asdict(),
    media_type=lb.MediaType.Document,
)

### Step 3: Creating a labeling project

In [None]:
# Create a Labelbox project for document media that receives its data rows
# through batches.
project = client.create_project(
    name="PDF_annotation_demo",
    queue_mode=QueueMode.Batch,
    media_type=lb.MediaType.Document,
)

# Connect the project to the ontology created in the previous step.
project.setup_editor(ontology)

### Step 4: Send a batch of data rows to the project

In [None]:
# Send the data row imported in Step 1 to the project.
project.create_batch(
    name="PDF_annotation_batch",  # each batch in a project must have a unique name
    global_keys=[global_key],  # global keys of the data rows to include in the batch
    priority=5,  # priority between 1 (highest) and 5 (lowest)
)

### Step 5. Create the annotation payload
Create the annotations payload using the code snippets from the "Supported Annotations" section above.

Labelbox supports all of the annotations above in NDJSON format; only a subset of them is available as Python annotation types for this data type.

The resulting label should have exactly the same content for annotations that are supported by both (with exception of the uuid strings that are generated)

##### First, we need to populate the text selections for Entity annotations

In [None]:
import requests
import json


# To learn how to generate a text layer for your documents please refer to the following repositories/files: 
# https://github.com/Labelbox/PDF-OCR-Transform-CLI/blob/main/src/scripts/gcloud/gcp-vision-to-lb-text-layer.py
# https://github.com/Labelbox/PDF-OCR-Transform-CLI/blob/main/src/scripts/adobe/adobe-ocr-to-lb-text-layer.py

text_layer = "https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483-lb-textlayer.json"

# Fetch the content of the text layer. The timeout stops the cell from hanging
# on a stalled connection, and raise_for_status() reports an HTTP failure here
# rather than as a JSON decoding error further down.
res = requests.get(text_layer, timeout=30)
res.raise_for_status()

content_phrases = ["Metal-insulator (MI) transitions have been one of the" , 
                   "T. Sasaki,* N. Yoneyama, and N. Kobayashi", 
                   "Organic charge transfer salts based on the donor",
                   "the experimental investigations on this issue have not"]

# Each phrase provides the text selection for one NDJSON annotation.
ndjson_by_phrase = {
    content_phrases[0]: entities_annotations_ndjson,
    content_phrases[1]: ner_with_checklist_subclass_annotation_ndjson,
    content_phrases[2]: entity_source,
    content_phrases[3]: entity_target,
}

# Parse the text layer

text_selections = []
matched_phrases = set()

for obj in json.loads(res.text):
  for group in obj['groups']:
    annotation_ndjson = ndjson_by_phrase.get(group['content'])
    if annotation_ndjson is None:
      # This group of words is not one of the phrases we annotate.
      continue
    matched_phrases.add(group['content'])
    token_ids = [token['id'] for token in group['tokens']]
    # Replace the placeholder selection of the NDJSON annotation.
    annotation_ndjson.update(
      {
        'textSelections': [
          {
            'groupId': group['id'],  # id associated with the group of words
            'tokenIds': token_ids,  # id associated with each word in a sentence group
            'page': 1,
          }
        ]
      }
    )
    if annotation_ndjson is entities_annotations_ndjson:
      # Build the matching text selection for Annotation Types.
      text_selections.append(
        lb_types.DocumentTextSelection(groupId=group['id'], tokenIds=token_ids, page=1))

# A phrase that was never found leaves its annotation with the original
# placeholder selection, so report it instead of failing silently.
missing_phrases = [phrase for phrase in content_phrases if phrase not in matched_phrases]
if missing_phrases:
  print(f"WARNING: phrases not found in the text layer: {missing_phrases}")

#re-write the entity annotation with text selections (annotation types)
entities_annotation_document_entity = lb_types.DocumentEntity(name="named_entity", 
                                          textSelections = text_selections)
entities_annotation = lb_types.ObjectAnnotation(name="named_entity",
                                                value=entities_annotation_document_entity)


print(f"entities_annotations_ndjson={entities_annotations_ndjson}")
print(f"entities_annotation={entities_annotation}")
print(f"nested_entities_annotation={ner_with_checklist_subclass_annotation_ndjson}")
print(f"entity_source={entity_source}")
print(f"entity_target={entity_target}")
  

#### Python annotation
Here we create the complete label payload using only Python annotation types. There is one annotation for each reference to an annotation that we created. Note that only a handful of Python annotation types are supported for PDF documents.

In [None]:


# One label for the data row, holding every annotation built with Python
# annotation types.
labels = [
    lb_types.Label(
        data=lb_types.DocumentData(global_key=global_key),
        annotations=[
            entities_annotation,
            checklist_annotation,
            text_annotation,
            radio_annotation,
        ],
    )
]

#### NDJson annotations
Here we create the complete labels ndjson payload of annotations only using NDJSON format. There is one annotation for each reference to an annotation that we created above.

In [None]:

# Collect every NDJSON annotation, in upload order.
label_ndjson = [
    entities_annotations_ndjson,
    bbox_annotation_ndjson,
    text_annotation_ndjson,
    checklist_annotation_ndjson,
    nested_checklist_annotation_ndjson,
    bbox_with_radio_subclass_annotation_ndjson,
    ner_with_checklist_subclass_annotation_ndjson,
    nested_radio_annotation_ndjson,
    radio_annotation_ndjson,
    entity_source,
    entity_target,
    ner_relationship_annotation_ndjson,  # Only supported for MAL imports
    bbox_source,
    bbox_target,
    bbox_relationship_annotation_ndjson,  # Only supported for MAL imports
]

# Tag each annotation (in place) with the data row it belongs to.
for annot in label_ndjson:
    annot['dataRow'] = {'globalKey': global_key}



In [None]:
# Show both relationship entities. A notebook cell only renders its last bare
# expression, so display() is used to output the two of them.
display(entity_source, entity_target)

### Step 6: Import the annotation payload

Option A: Upload to a labeling project as pre-labels (MAL)

In [None]:
# Submit the NDJSON payload as pre-labels; the uuid suffix gives every import
# job a distinct name.
upload_job = lb.MALPredictionImport.create_from_objects(
    client=client,
    project_id=project.uid,
    name=f"pdf_annotation_upload{uuid.uuid4()}",
    predictions=label_ndjson,
)

# Wait for the import to finish before reading its results.
upload_job.wait_until_done()
# Errors will appear for annotation uploads that failed.
print("Errors:", upload_job.errors)
print("Status of uploads: ", upload_job.statuses)

Option B: Upload to a labeling project as ground truth

In [None]:
# Relationships are only supported for MAL imports: remove the relationship
# annotations from label_ndjson, then uncomment this code to run the label import.

# upload_job = lb.LabelImport.create_from_objects(
#     client = client, 
#     project_id = project.uid, 
#     name="label_import_job"+str(uuid.uuid4()),  
#     labels=label_ndjson)

# print("Errors:", upload_job.errors)
# print("Status of uploads: ", upload_job.statuses)