In [None]:
import sys
import json
import spacy
import random
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training.example import Example
from tqdm import tqdm

In [None]:
def transform_jsonl_for_spacy(path_to_jsonl):
    """Read a JSONL annotation file and split it into test and training data.

    Every non-blank line must be a JSON object with a "data" key (the text)
    and, optionally, a "label" key holding the entity annotations. Records
    without a "label" key (or with a null label) get an empty entity list.

    Args:
        path_to_jsonl: Path of the JSONL file to read (opened as UTF-8).

    Returns:
        A tuple ``(test_data, train_data)`` -- note the order. Both are lists
        of ``(text, {"entities": [...]})`` tuples. The first 70% of the
        records (rounded, in file order, no shuffling) become ``train_data``;
        the remainder becomes ``test_data``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "data" key.
    """
    records = []

    # The context manager closes the file even if a line fails to parse.
    with open(path_to_jsonl, "r", encoding="utf-8") as jsonl_file:
        for raw_line in jsonl_file:
            # Tolerate blank lines, e.g. an empty line at the end of the file.
            if not raw_line.strip():
                continue

            record = json.loads(raw_line)

            # Copy the annotations so the result does not alias the parsed record.
            entities = list(record.get("label") or [])
            records.append((record["data"], {"entities": entities}))

    # split the data into training and test data
    amount_train_data = round(len(records) * 0.7)
    train_data = records[:amount_train_data]
    test_data = records[amount_train_data:]

    return test_data, train_data

In [None]:
def update_ner_model(test_data, train_data, n_iter=70, output_dir='/path/to/store/model'):
    """Update the NER component of ``en_core_web_sm`` and save the pipeline.

    Args:
        test_data: Held-out examples. Not used by this function; the parameter
            is kept so existing calls keep working.
        train_data: List of ``(text, {"entities": [...]})`` tuples. The third
            item of every entity is registered as an NER label. The list is
            not modified; shuffling happens on a copy.
        n_iter: Number of passes over the training data.
        output_dir: Directory the updated pipeline is written to.
    """
    # load spaCy model
    nlp = spacy.load('en_core_web_sm')

    # load the Named Entity Recognizer of the model and add new labels
    ner = nlp.get_pipe("ner")
    for _, annotation in train_data:
        for entity in annotation["entities"]:
            ner.add_label(entity[2])

    # add - as new split object to the tokenizer
    infixes = list(nlp.Defaults.infixes)
    infixes.append("-")
    infix_regex = spacy.util.compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_regex.finditer

    # define the pipelines that shall be updated; everything else is disabled
    # while training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    optimizer = nlp.resume_training()

    # Shuffle a copy so the caller's list keeps its original order.
    shuffled_train_data = list(train_data)

    with nlp.select_pipes(disable=unaffected_pipes):

        for _ in tqdm(range(n_iter), desc="Iterations"):

            # shuffling examples before every iteration
            random.shuffle(shuffled_train_data)
            losses = {}

            # batch up the examples using spaCy's minibatch
            batches = minibatch(shuffled_train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotation)
                    for text, annotation in batch
                ]
                nlp.update(
                    examples,
                    sgd=optimizer,
                    drop=0.3,  # dropout
                    losses=losses,
                )
            print("Losses", losses)

    # Make sure the target directory (including parents) exists before saving.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

In [None]:
# Parse the annotation file once; the held-out split comes back first,
# the training split second.
test_data, train_data = transform_jsonl_for_spacy(
    path_to_jsonl="/path/to/jsonlfile",
)

In [None]:
# Train the NER component on the training split and save the pipeline to disk.
update_ner_model(
    test_data=test_data,
    train_data=train_data,
)

In [None]:
# Must be the directory update_ner_model() saved the pipeline to.
output_dir = Path('/path/to/store/model')

# Load the saved model and predict
print("Loading from", output_dir)
nlp_updated = spacy.load(output_dir)
doc = nlp_updated("Foxes infected by rabies have a mortility rate of 100% if not vaccined.")

# Display the recognised entities as (text, label) pairs.
[(ent.text, ent.label_) for ent in doc.ents]