## Domain specific tokenization
This notebook contains basic code for tokenizing an incoming sentence using named entities returned by a domain-specific annotator.
##### How to run:
##### A. Stored annotation response:
1. Set `use_stored_response` = True

##### B. Response from live annotation service:
1. Set `use_stored_response = False`.
2. Make sure an annotation service that returns spans of named entities for a given fragment of text is available.
3. Set the variable `request_url` to the URL where the annotation service is running.
4. Change `request_body` as per the API requirements of the annotation service selected in step 2.

In [1]:
import spacy
from spacy import displacy
import requests
import json

In [2]:
# Example sentence that the rest of the notebook annotates and retokenizes.
sentence = (
    "Full-mouth debridement is not payable on the same date of services "
    "as other prophylactic or preventive procedures."
)

# True  -> reuse the canned `stored_response`.
# False -> call the live annotation service at `request_url`.
use_stored_response = True

# Endpoint and JSON payload used only when the live annotation service is called.
request_url = "ANNOTATION_URL"
request_body = {"text": sentence}

In [3]:
# Canned annotation-service output for the example sentence; invoke_annotation_service
# returns this list as-is when use_stored_response is True. Each entry's 'span' holds
# the covered text plus its 'begin'/'end' character offsets into the sentence
# ('end' is exclusive, e.g. 'the same date' -> 41..54). Several entries cover the same
# or nested spans, which is what filter_annotations is later used to reduce.
stored_response = [{'annotatorId': 'AnnotatorA', 'resourceId': 'http://annotations/same_date', 'span': {'begin_End': '41 _ 54', 'covered_text': 'the same date', 'lemma': 'the same date', 'begin': 41, 'end': 54}, 'type': 'SIMPLE', 'score': 1.0, 'semanticTypes': [], 'displayName': None}, {'annotatorId': 'AnnotatorA', 'resourceId': 'http://annotations/Service', 'span': {'begin_End': '58 _ 66', 'covered_text': 'services', 'lemma': 'services', 'begin': 58, 'end': 66}, 'type': 'SIMPLE', 'score': 1.0, 'semanticTypes': [], 'displayName': None}, {'annotatorId': 'AnnotatorA, AnnotatorB', 'resourceId': 'http://annotations/custom_procedure_codes_group_preventative_procedures', 'span': {'begin_End': '92 _ 113', 'covered_text': 'preventive procedures', 'lemma': 'preventive procedures', 'begin': 92, 'end': 113}, 'type': 'SIMPLE', 'score': 1.0, 'semanticTypes': [], 'displayName': None}, {'annotatorId': 'AnnotatorA', 'resourceId': 'http://annotations/hasMutuallyExclusiveNotReimbursableService', 'span': {'begin_End': '26 _ 37', 'covered_text': 'not payable', 'lemma': 'not payable', 'begin': 26, 'end': 37}, 'type': 'SIMPLE', 'score': 1.0, 'semanticTypes': [], 'displayName': 'mutual exclusive non-reimbursable service'}, {'annotatorId': 'AnnotatorA', 'resourceId': 'http://annotations/CustomProcedureCodesGroup', 'span': {'begin_End': '103 _ 113', 'covered_text': 'procedures', 'lemma': 'procedures', 'begin': 103, 'end': 113}, 'type': 'SIMPLE', 'score': 1.0, 'semanticTypes': [], 'displayName': None}, {'annotatorId': 'WatsonX Entities and Lucene Annotator', 'resourceId': 'http://annotations/custom_procedure_codes_group_prophylactic_procedures', 'span': {'begin_End': '76 _ 88', 'covered_text': 'prophylactic', 'lemma': 'prophylactic', 'begin': 76, 'end': 88}, 'type': 'SIMPLE', 'score': 0.5, 'semanticTypes': ['topp'], 'displayName': None}, {'annotatorId': 'AnnotatorA', 'resourceId': 'http://annotations/ProcedureCode', 'span': {'begin_End': '103 _ 113', 'covered_text': 'procedures', 'lemma': 
'procedures', 'begin': 103, 'end': 113}, 'type': 'SIMPLE', 'score': 1.0, 'semanticTypes': [], 'displayName': None}, {'annotatorId': 'AnnotatorB', 'resourceId': 'http://annotations/procedure_code_d4355', 'span': {'begin_End': '0 _ 22', 'covered_text': 'Full-mouth debridement', 'lemma': 'full mouth debridement', 'begin': 0, 'end': 22}, 'type': 'SIMPLE', 'score': 1.0, 'semanticTypes': [], 'displayName': 'd4355 - full mouth debridement to enable comprehensive evaluation and diagnosis'}, {'annotatorId': 'AnnotatorA', 'resourceId': 'http://annotations/hasMutuallyExclusiveNotReimbursableService', 'span': {'begin_End': '23 _ 37', 'covered_text': 'is not payable', 'lemma': 'is not payable', 'begin': 23, 'end': 37}, 'type': 'SIMPLE', 'score': 1.0, 'semanticTypes': [], 'displayName': 'mutual exclusive non-reimbursable service'}, {'annotatorId': 'AnnotatorA', 'resourceId': 'http://annotations/same_date', 'span': {'begin_End': '45 _ 54', 'covered_text': 'same date', 'lemma': 'same date', 'begin': 45, 'end': 54}, 'type': 'SIMPLE', 'score': 1.0, 'semanticTypes': [], 'displayName': None}, {'annotatorId': 'AnnotatorA, AnnotatorB', 'resourceId': 'http://annotations/ServiceCategory', 'span': {'begin_End': '58 _ 66', 'covered_text': 'services', 'lemma': 'services', 'begin': 58, 'end': 66}, 'type': 'SIMPLE', 'score': 1.0, 'semanticTypes': ['ocac'], 'displayName': None}, {'annotatorId': 'AnnotatorB', 'resourceId': 'http://annotations/hasMutuallyExclusiveNotReimbursableService', 'span': {'begin_End': '30 _ 37', 'covered_text': 'payable', 'lemma': 'not payable', 'begin': 30, 'end': 37}, 'type': 'SIMPLE', 'score': 1.0, 'semanticTypes': ['qnco'], 'displayName': 'mutual exclusive non-reimbursable service'}, {'annotatorId': 'AnnotatorA', 'resourceId': 'http://annotations/hasApplicableService', 'span': {'begin_End': '30 _ 37', 'covered_text': 'payable', 'lemma': 'payable', 'begin': 30, 'end': 37}, 'type': 'SIMPLE', 'score': 1.0, 'semanticTypes': [], 'displayName': 'applicable service'}, 
{'annotatorId': 'AnnotatorA', 'resourceId': 'http://annotations/ProcedureCodeGroup', 'span': {'begin_End': '103 _ 113', 'covered_text': 'procedures', 'lemma': 'procedures', 'begin': 103, 'end': 113}, 'type': 'SIMPLE', 'score': 1.0, 'semanticTypes': [], 'displayName': None}]

"""
This method detects whether the returned named entity is multi-word
"""
def is_multi_word(annotation):
    """Tell whether an annotation covers more than one word.

    Args:
        annotation: dict whose ``['span']['covered_text']`` entry holds the
            annotated text.

    Returns:
        True when the covered text contains at least one space character,
        False otherwise.
    """
    covered_text = annotation['span']['covered_text']
    return " " in covered_text


"""
This method translates annotated spans to spacy's indices in spacy's DOC object.
"""
def find_annotation_token_indexes(doc, annotation):
    """Map an annotation's character span onto token indexes of ``doc``.

    Args:
        doc: iterable of tokens exposing ``idx`` (character offset of the
            token's first character) and ``i`` (token index), e.g. a spaCy Doc.
        annotation: dict with ``['span']['begin']`` and ``['span']['end']``
            character offsets (``end`` exclusive).

    Returns:
        ``[start, end]`` suitable for ``doc[start:end]``:
        * ``start`` is the index of the token that begins exactly at the
          annotation's ``begin`` offset, or ``None`` if no token starts there.
        * ``end`` is the index of the first token that begins at or after the
          annotation's ``end`` offset, or ``None`` when the annotation runs to
          the last token (or when ``start`` was not found).
    """
    span_begin = annotation['span']['begin']
    span_end = annotation['span']['end']
    merge_token_list_start = None
    merge_token_list_end = None
    for token in doc:
        if merge_token_list_start is None:
            # Use the function's own argument here; relying on a same-named
            # notebook global would break any call made outside that loop.
            if token.idx == span_begin:
                merge_token_list_start = token.i
        elif token.idx >= span_end:
            # Compare offsets rather than token text: a following token whose
            # text merely occurs inside the covered text (e.g. "at" in "date")
            # must still end the span.
            merge_token_list_end = token.i
            break
    return [merge_token_list_start, merge_token_list_end]

"""
This utility method is used to extract only the multi-word tokens from a set of annotations returned by an annotation service.
"""
def get_multi_word_tokens(sentence):
    """Return the covered text of every multi-word annotation for ``sentence``.

    The annotations come from ``invoke_annotation_service``, are reduced by
    ``filter_annotations``, and only those for which ``is_multi_word`` is true
    contribute their ``['span']['covered_text']`` to the result, in order.
    """
    non_overlapping = filter_annotations(invoke_annotation_service(sentence))
    return [
        candidate['span']['covered_text']
        for candidate in non_overlapping
        if is_multi_word(candidate)
    ]

"""
Method to invoke annotation service
"""
def invoke_annotation_service(sentence, timeout=30):
    """Fetch the annotations for ``sentence``.

    When ``use_stored_response`` is true the canned ``stored_response`` is
    returned and no network call is made. Otherwise ``request_body`` is POSTed
    as JSON to ``request_url`` and the decoded JSON reply is returned.

    Args:
        sentence: text to annotate.
        timeout: seconds to wait for the live service before giving up.

    Returns:
        The annotation list (stored, or decoded from the service reply).

    Raises:
        requests.HTTPError: the service answered with an error status.
        requests.Timeout: the service did not answer within ``timeout``.
    """
    if use_stored_response:
        return stored_response

    # request_body is built once from the notebook-level sentence; copy it and
    # substitute the sentence actually passed in so the argument is honoured.
    # Payloads customised to have no "text" field are sent unchanged.
    payload = dict(request_body)
    if "text" in payload:
        payload["text"] = sentence

    resp = requests.post(
        request_url,
        data=json.dumps(payload),
        headers={'Content-Type': 'application/json'},
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    resp.raise_for_status()
    return resp.json()
    

"""
Utility method to check if an annotation span has already been covered as part of a larger annotation span
"""
def should_element_be_added(element, annotation):
    """Decide whether ``element`` is free of any larger enclosing annotation.

    Args:
        element: annotation dict with ``['span']['begin']`` / ``['span']['end']``.
        annotation: iterable of annotation dicts to compare against.

    Returns:
        False when some entry of ``annotation`` spans ``element`` completely
        while being larger on at least one side; True otherwise. Entries with
        exactly the same begin and end as ``element`` do not count.
    """
    def _properly_contains(other):
        inner, outer = element['span'], other['span']
        inside = outer['begin'] <= inner['begin'] and inner['end'] <= outer['end']
        identical = outer['begin'] == inner['begin'] and outer['end'] == inner['end']
        return inside and not identical

    return not any(_properly_contains(other) for other in annotation)


"""
A utility method to extract annotations that are not overlapping with each other.
In case of overlap, this method returns annotation with longest span.
e.g.: If annotation list includes ["same date", "same date of service"]
the method will return ["same date of service"]
"""
def filter_annotations(annotations):
    """Reduce ``annotations`` to a set of mutually non-overlapping annotations.

    Pass 1 keeps the first annotation seen for each distinct (begin, end) span
    and drops every annotation that ``should_element_be_added`` reports as
    nested inside a larger one.

    Pass 2 resolves the remaining partial overlaps in favour of the longest
    annotation: more words first, then more characters, then earlier position
    in the input. Each survivor appears exactly once.

    Args:
        annotations: list of annotation dicts carrying ``['span']['begin']``,
            ``['span']['end']`` and ``['span']['covered_text']``.

    Returns:
        The surviving annotation dicts, in their original relative order.
    """
    # Pass 1: de-duplicate identical spans and discard nested annotations.
    seen_spans = set()
    candidates = []
    for el in annotations:
        span_key = (el['span']['begin'], el['span']['end'])
        if span_key in seen_spans:
            continue
        seen_spans.add(span_key)
        if should_element_be_added(el, annotations):
            candidates.append(el)

    # Pass 2: visit candidates longest-first and keep one only if it does not
    # overlap anything already kept, so the longer side of every clash wins.
    def _preference(position):
        span = candidates[position]['span']
        word_count = len(span['covered_text'].split(" "))
        # Negated sizes so that sorted() puts the longest candidate first.
        return (-word_count, span['begin'] - span['end'], position)

    kept_positions = []
    for position in sorted(range(len(candidates)), key=_preference):
        span = candidates[position]['span']
        clashes = any(
            span['begin'] < candidates[kept]['span']['end']
            and candidates[kept]['span']['begin'] < span['end']
            for kept in kept_positions
        )
        if not clashes:
            kept_positions.append(position)

    # Restore input order so callers see annotations in the order supplied.
    return [candidates[position] for position in sorted(kept_positions)]


### Detect multi word named entities
This method is used to detect multi-word entities in a given sentence

In [4]:
# Collect the multi-word entities for the example sentence and list them,
# one per line, under a heading.
multi_word_entities = get_multi_word_tokens(sentence)
print("Multi-word entities detected in the sentence are:")
if multi_word_entities:
    print(*multi_word_entities, sep="\n")

Multi-word entities detected in the sentence are:
the same date
preventive procedures
Full-mouth debridement
is not payable


### Retokenize
When multi-word entities are detected, each one is merged into a single token before the dependency tree is rendered.

In [5]:
# Parse the sentence twice so that an untouched copy (`doc_original`) survives
# for the "before" rendering; the merges below modify `doc` in place.
nlp = spacy.load("en_core_web_sm")
doc_original = nlp(sentence)
doc = nlp(sentence)
annotaions = invoke_annotation_service(sentence)
selected_annotations = filter_annotations(annotaions)
# NOTE(review): `annotaion` is a misspelling of `annotation`; before renaming
# it, make sure no helper reads this loop variable as a notebook global.
for annotaion in selected_annotations:
    if not is_multi_word(annotaion):
        continue
    # Token indexes are looked up again on every pass because each merge
    # shortens `doc` and shifts the indexes of the tokens that follow.
    [start, end] = find_annotation_token_indexes(doc, annotaion)
    if start is None:
        # No token starts at the annotation's begin offset. doc[None:end]
        # would merge from the first token of the document, so skip instead.
        continue
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[start:end])

### Dependency Tree prior to retokenization

In [6]:
# Dependency parse of the untouched document ("dep" is displaCy's default style).
displacy.render(doc_original, style="dep")

### Dependency tree after retokenization

In [7]:
# Dependency parse of the document after the multi-word merges
# ("dep" is displaCy's default style).
displacy.render(doc, style="dep")