<td>
   <a target="_blank" href="https://labelbox.com" ><img src="https://labelbox.com/blog/content/images/2021/02/logo-v4.svg" width=256/></a>
</td>

<td>
<a href="https://colab.research.google.com/github/Labelbox/labelbox-python/blob/master/examples/annotation_import/pdf.ipynb" target="_blank"><img
src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>
</td>

<td>
<a href="https://github.com/Labelbox/labelbox-python/tree/master/examples/annotation_import/pdf.ipynb" target="_blank"><img
src="https://img.shields.io/badge/GitHub-100000?logo=github&logoColor=white" alt="GitHub"></a>
</td>

# PDF Annotation Import

* Notes:
    * Wait until the import job is complete before opening the Editor to make sure all annotations are imported properly.

In [None]:
!pip install -q 'labelbox[data]'

# Imports

In [None]:
import labelbox as lb
import labelbox.types as lb_types
import uuid
from uuid import uuid4
import json

# API Key and Client
Provide a valid API key below to connect to the Labelbox client.

In [None]:
# Add your api key
# NOTE(review): API_KEY is a placeholder. Per the Labelbox SDK docs, passing None makes the
# client fall back to the LABELBOX_API_KEY environment variable — confirm for your SDK
# version. Avoid committing a real key in this notebook.
API_KEY = None
# Client used for every Labelbox call in the rest of the notebook.
client = lb.Client(api_key=API_KEY)

---- 
### Steps
1. Make sure the project is set up
2. Collect annotations
3. Upload

### Step 1: Project setup

First, we create an ontology containing the entity (NER) tool used in this example. The official lists of annotation types supported for import can be found here:
- [Model-Assisted Labeling](https://docs.labelbox.com/docs/model-assisted-labeling) (annotations/labels are not submitted)
- [PDF Annotations](https://docs.labelbox.com/docs/document-annotations)

In [None]:
# Name given to the entity tool; the annotation payload later refers to the tool by this name.
tool_name = "super"

# Single entity (NER) tool, named by `tool_name`.
ner_tool = lb.Tool(tool=lb.Tool.Type.NER, name=tool_name)

# Ontology definition made up of just that tool.
ontology_builder = lb.OntologyBuilder(tools=[ner_tool])

In [None]:
# Register the ontology with Labelbox from the builder's dictionary form.
ontology = client.create_ontology(
    "pdf-entity-import-ontology",
    ontology_builder.asdict(),
)

In [None]:
# Project that will receive the imported annotations; it uses the Document media type.
mal_project = client.create_project(
    name="pdf_entity_import",
    media_type=lb.MediaType.Document,
)

# Dataset that will hold the example PDF.
dataset = client.create_dataset(name="pdf_entity_import_dataset")

# One example data row: the PDF itself plus the URL of its text layer JSON.
asset = [{
    "row_data": {
        "pdf_url": "https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483.pdf",
        "text_layer_url": "https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483-lb-textlayer.json",
    },
}]

# Data row creation runs as a task; wait for it to finish, then show any errors it reported.
task = dataset.create_data_rows(asset)
task.wait_till_done()
print(task.errors)




None


In [None]:

# Take the first data row from the dataset.
# Despite its name, `data_row_id` holds the data row object itself; its ID is read later via `.uid`.
data_row_id = next(dataset.data_rows())

# Connect your ontology and editor to your MAL project
mal_project.setup_editor(ontology)

# Connect your dataset to your MAL project by sending the data row in a uniquely named batch.
# NOTE(review): the third argument (5) is presumably the batch priority — confirm against create_batch.
batch_name = 'test-batch_' + str(uuid4())
batch = mal_project.create_batch(batch_name, [data_row_id], 5)

### Step 2: Create entity annotation

In [None]:
# Text selection for the entity: one token on page 1, identified by its token ID and group ID.
# NOTE(review): these IDs presumably come from the data row's text layer JSON — confirm.
text_selection = {
    "tokenIds": [
        "521f705e-b276-4ac7-8e5b-2e38e037f80f",  # superconductivity
    ],
    "groupId": "ed53dd86-ef39-4634-9505-ee0eebedef44",
    "page": 1,
}

# Entity annotation payload: a fresh UUID, the tool name from the ontology,
# the target data row's ID, and the text selection above.
ANNOTATION = {
    "uuid": str(uuid.uuid4()),
    "name": tool_name,
    "dataRow": {"id": data_row_id.uid},
    "textSelections": [text_selection],
}

### Step 3: Upload annotations

In [None]:
# Start a MAL prediction import for the project, under a unique import name,
# carrying the single annotation built above.
task = lb.MALPredictionImport.create_from_objects(
    client,
    mal_project.uid,
    str(uuid.uuid4()),
    [ANNOTATION],
)

In [None]:
# Block until the import job has finished, so the results inspected next are final.
task.wait_until_done()

In [None]:
# Errors reported by the import job (an empty list in the recorded output).
print(task.errors)
# Per-annotation status entries (uuid, data row id, status) as shown in the recorded output.
print(task.statuses)

[]
[{'uuid': 'dffe7d75-1c61-4dcd-a0d4-804438cf540b', 'dataRow': {'id': 'clb5cdf1c6u0a077d87kffy28'}, 'status': 'SUCCESS'}]
