# In [None]:
"""################  PREPARE DATA FOR SPACY STEP 1  ################"""
import json
import os

import spacy
from spacy.gold import GoldCorpus, minibatch, biluo_tags_from_offsets, tags_to_entities


def docs_from_offsets(nlp, gold):
    """Turn (text, annotation) pairs into (doc, BILUO tags) pairs.

    Args:
        nlp: callable that turns a text into a spaCy doc.
        gold: iterable of ``(text, annotation)`` pairs, where
            ``annotation['entities']`` holds ``(start, end, label)``
            character-offset triples.

    Returns:
        A list of ``(doc, tags)`` tuples. ``tags`` is computed from all
        offsets of the text; ``doc.ents`` only receives the spans that
        ``doc.char_span`` could build. Texts whose doc ends up without any
        entity are left out.
    """
    parsed = []
    for text, annotation in gold:
        doc = nlp(text)
        offsets = annotation['entities']
        tags = biluo_tags_from_offsets(doc, offsets)
        for start, end, label in offsets or ():
            span = doc.char_span(start, end, label=label)
            if not span:
                # no usable span for these offsets -> nothing to attach
                continue
            doc.ents = list(doc.ents) + [span]
        # drop this check to return documents without entities too
        if doc.ents:
            parsed.append((doc, tags))
    return parsed


def docs_to_trees(docs):
    """Build spaCy JSON training data from a sequence of (doc, tags) pairs.

    Args:
        docs: iterable of ``(doc, tags)`` tuples; ``tags`` is indexed by
            token position (``tags[token.i]``).

    Returns:
        A list with one dict per kept document, each holding the document's
        position in ``docs`` as ``'id'`` and a single paragraph with the raw
        text and its sentences' token records. A document is dropped (and
        its position printed) when ``tags_to_entities`` raises
        ``AssertionError`` for its tags or when its tag sequence is empty.
    """
    trees = []
    for index, (doc, tags) in enumerate(docs):
        # Validate the tag sequence first; an empty one is unusable as well.
        try:
            tags_to_entities(tags)
        except AssertionError:
            unusable = True
        else:
            unusable = not tags
        if unusable:
            print('Dropping {}'.format(index))
            continue

        sentences = [
            {
                'tokens': [
                    {
                        'id': token.i,
                        'orth': token.orth_,
                        'tag': token.tag_,
                        # head is stored as an offset relative to the token
                        'head': token.head.i - token.i,
                        'dep': token.dep_,
                        'ner': tags[token.i],
                    }
                    for token in sent
                ]
            }
            for sent in doc.sents
        ]
        paragraph = {'raw': doc.text, 'sentences': sentences}
        trees.append({'id': index, 'paragraphs': [paragraph]})
    return trees


def train_data_to_json(train_data, nlp, output_json='train.json'):
    """Convert offset-annotated training data and write it as JSON.

    Args:
        train_data: sequence of ``(text, {'entities': [...]})`` pairs, passed
            on to ``docs_from_offsets``.
        nlp: pipeline used to parse each text.
        output_json: path of the file to write (overwritten if it exists).

    Returns:
        Always ``True``; failures surface as exceptions from the helpers or
        from writing the file.
    """
    parsed_docs = docs_from_offsets(nlp, train_data)
    json_trees = docs_to_trees(parsed_docs)

    with open(output_json, 'wt') as handle:
        json.dump(json_trees, handle)

    print('successfully converted to json:', output_json)
    return True


"""################  PREPARE DATA FOR SPACY STEP 2 ################"""

# Every WebAnno JSON export found below INPUT_DIR is converted into a spaCy
# training file of the same name inside OUTPUT_DIR, with one training example
# per sentence.
INPUT_DIR = '/home/mareike/final_test_data'
OUTPUT_DIR = os.path.join(INPUT_DIR, 'spacy_json')

# load pretrained spacy model and disable disturbing pipeline components:
nlp = spacy.load('de_core_news_sm', disable=["tagger", "ner"])


# number of input files that carry no named-entity annotations
noNER = 0
for root, dirs, files in os.walk(INPUT_DIR):
    # OUTPUT_DIR lives inside INPUT_DIR: prune it from the walk so the files
    # written by this very loop are not picked up as inputs.
    dirs[:] = [d for d in dirs if os.path.join(root, d) != OUTPUT_DIR]

    for name in files:
        # read output json file from webanno, from the directory the file
        # was actually found in:
        with open(os.path.join(root, name), encoding='utf-8') as data_file:
            data = json.load(data_file)

        try:
            # extract entity start/end positions and names:
            ent_loc = data['_views']['_InitialView']['NamedEntity']
        except KeyError:
            noNER += 1
            continue

        # extract sentence start/end positions:
        Sentence = data['_views']['_InitialView']['Sentence']

        # NOTE(review): the document text is read from the hard-coded
        # reference id '12' -- confirm this id is the same in every export.
        sofa_string = data['_referenced_fss']['12']['sofaString']

        # prepare spacy formatted training data:
        TRAIN_DATA = []
        for sent_index, sent_borders in enumerate(Sentence):
            # the first sentence is always treated as starting at offset 0
            sent_begin = 0 if sent_index == 0 else sent_borders['begin']
            sent_end = sent_borders['end']

            ent_list_sen = []
            for ent in ent_loc:
                label = ent.get('value')
                if label is None:
                    # an annotation without a label cannot serve as a
                    # training example, so it is skipped
                    continue
                # presumably the export leaves out 'begin' when it is 0, so
                # a missing key is read as offset 0 -- TODO confirm
                ent_begin = ent.get('begin', 0)
                ent_end = ent['end']
                if ent_begin >= sent_begin and ent_end <= sent_end:
                    # webanno offsets are relative to the whole document;
                    # shift them so they are relative to the sentence start
                    ent_list_sen.append(
                        [ent_begin - sent_begin, ent_end - sent_begin, label]
                    )

            TRAIN_DATA.append(
                [sofa_string[sent_begin:sent_end], {'entities': ent_list_sen}]
            )

        try:
            train_data_to_json(TRAIN_DATA, nlp, os.path.join(OUTPUT_DIR, name))
        except ValueError:
            # report the file whose conversion failed and go on with the next
            print(name)

print('Amount of .json data without named entitys:' +str(noNER) )



"""
Sources:
https://www.thinkinfi.com/2020/01/train-custom-ner-spacy.html (modified)
https://support.prodi.gy/t/prodigy-annotations-to-spacy-train/243/11 (modified)
"""