# Load SageMaker Ground Truth annotations

## Load task manifest file mapping text sample IDs to text

In [None]:
import codecs
from dataclasses import dataclass
import json
from pathlib import Path
from typing import Dict, List

import spacy

In [None]:
# Manifest file of the labeling job; it is read line by line below as
# tab-separated records keyed by an integer sample index.
manifest_path = Path("annotations/glue-dir-kbase-dev-sagemaker-ground-truth-labeling-clone/annotations/intermediate/1/annotations.manifest")

In [None]:
# Fail fast with a readable message when the manifest is missing
# (typically because the notebook runs from a different working directory).
assert manifest_path.is_file(), f"manifest file not found: {manifest_path}"

In [None]:
# Maps the integer sample index (first manifest column) to the sample text
# exactly as stored in the manifest; escape sequences are resolved later.
index_to_text_raw: Dict[int, str] = {}

In [None]:
# Each manifest line has the form "<index> TAB <text> TAB <trailing column>".
# The text itself may contain tabs, so everything between the first and the
# last column is re-joined with tabs.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale.
with open(manifest_path, "rt", encoding="utf-8") as fin:
    for line in fin:
        if not line.strip():
            # tolerate blank lines (e.g. an empty line at the end of the file)
            continue
        columns = line.split("\t")
        index = int(columns[0])
        text = "\t".join(columns[1:-1])
        # columns[-1] is intentionally ignored; its meaning is unknown
        assert index not in index_to_text_raw, f"duplicate sample index {index} in {manifest_path}"
        index_to_text_raw[index] = text

## Load annotations for the given text sample IDs

In [None]:
# Directory with the worker responses; it is expected to contain one
# sub-directory per sample, named after the integer sample index.
annotations_dir = Path("annotations/glue-dir-kbase-dev-sagemaker-ground-truth-labeling-clone/annotations/worker-response/iteration-1/")

In [None]:
# Fail fast with a readable message when the worker-response directory is missing.
assert annotations_dir.is_dir(), f"annotations directory not found: {annotations_dir}"

In [None]:
# Maps the integer sample index to the raw "entities" value taken from the
# worker-response JSON (presumably a list of dicts with "label",
# "startOffset" and "endOffset" keys -- those are the keys read further down).
index_to_annotation_raw = {}

In [None]:
# Expected layout: <annotations_dir>/<sample index>/<one worker-response JSON file>.
for annotations_subdir in annotations_dir.iterdir():
    index = int(annotations_subdir.name)
    for i, annotations_file in enumerate(annotations_subdir.iterdir()):
        assert i == 0, f"found more than one annotation in {annotations_subdir}"
        # JSON text is UTF-8; do not rely on the platform's default encoding
        with open(annotations_file, "rt", encoding="utf-8") as fin:
            response = json.load(fin)
        assert index not in index_to_annotation_raw, f"duplicate annotation for sample index {index}"
        answers = response["answers"]
        # exactly one worker answer per sample is supported
        assert len(answers) == 1, f"expected exactly one answer in {annotations_file}, found {len(answers)}"
        entities = answers[0]['answerContent']['crowd-entity-annotation']['entities']
        index_to_annotation_raw[index] = entities

## Merge text and annotations while removing episode IDs from the text and adjusting entity positions

In [None]:
@dataclass
class EntityMatch:
    """A labeled entity span, expressed as offsets into a document's text."""
    # annotation label as assigned by the annotator (e.g. "Person")
    label: str
    # offset where the span starts; passed to spaCy's Doc.char_span as the start
    start_offset: int
    # offset where the span ends; passed to spaCy's Doc.char_span as the end
    end_offset: int

In [None]:
@dataclass
class Doc:
    """A text sample together with its entity annotations."""
    # identifier taken from the first line of the raw sample text
    id_: str
    # sample text with the identifier line removed
    text: str
    # entity spans whose offsets refer to `text` (not to the raw sample text)
    annotations: List[EntityMatch]

In [None]:
# Combine raw text and raw annotations into Doc objects. The first line of
# each raw text is the sample/episode ID; it is split off and all entity
# offsets are shifted so they refer to the remaining text.
docs: List[Doc] = []
for index, annotation_raw in index_to_annotation_raw.items():
    text_raw = index_to_text_raw[index]
    # Resolve backslash escape sequences stored in the manifest text.
    # Going through latin-1 with "backslashreplace" keeps non-ASCII characters
    # intact; calling codecs.unicode_escape_decode() on the str directly would
    # reinterpret their UTF-8 bytes as Latin-1 and corrupt both the text and
    # the entity offsets.
    text = text_raw.encode("latin-1", "backslashreplace").decode("unicode_escape")
    id_, text = text.split("\n", 1)
    id_offset = len(id_) + 1  # +1 for the newline that was stripped off by the split
    entity_matches = [
        EntityMatch(
            label=a["label"],
            start_offset=a["startOffset"] - id_offset,
            end_offset=a["endOffset"] - id_offset,
        )
        for a in annotation_raw
    ]
    docs.append(Doc(id_=id_, text=text, annotations=entity_matches))

## Create spacy Doc objects and load entity annotations

In [None]:
# spaCy pipeline used below to tokenize each text, so that the annotated
# character spans can be mapped onto token spans.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Translation table from the labels used in the labeling job to spaCy's
# built-in English NER labels.
# spacy English NER labels https://spacy.io/models/en#en_core_web_sm-labels
# spacy glossary: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py
label_to_spacy_ner_label = dict(
    Book="WORK_OF_ART",
    Person="PERSON",
    Software="PRODUCT",
)

In [None]:
# Run each text through the spaCy pipeline and attach the manual annotations
# as entity spans.
spacy_docs = []
# (doc id, annotation) pairs that could not be mapped onto spaCy tokens; kept
# so the mismatches can be inspected instead of disappearing silently.
unaligned_annotations = []
for d in docs:
    spacy_doc = nlp(d.text)
    # TODO use d.id_
    my_ents = []
    for a in d.annotations:
        ent = spacy_doc.char_span(
            a.start_offset,
            a.end_offset,
            label=label_to_spacy_ner_label[a.label],
        )
        if ent is None:
            # char_span returns None when the character offsets do not map
            # onto a valid token span.
            # TODO investigate where this happens (see unaligned_annotations)
            unaligned_annotations.append((d.id_, a))
            continue
        my_ents.append(ent)
    spacy_doc.set_ents(my_ents, default="unmodified") # TODO keep spacy_doc.ents from default pipeline?
    spacy_docs.append(spacy_doc)

TODO:

- convert to spacy
- visualize for sanity checking