In [1]:
import random
from packaging.version import Version

import spacy
from spacy.util import minibatch, compounding
from spacy.training.example import Example

import newron.spacy

In [2]:
remote_server_uri = SERVER_URI # set to your server URI
newron.set_tracking_uri(remote_server_uri)  # or set the MLFLOW_TRACKING_URI in the env
exp_name = "SpacyExample" # set your experiment name
newron.set_experiment(exp_name)

<Experiment: artifact_location='mlflow-artifacts:/48', experiment_id='48', lifecycle_stage='active', name='SpacyExample', tags={}>

In [3]:
IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0 = Version(spacy.__version__) >= Version("3.0.0")

In [4]:
# training data
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]

In [5]:
nlp = spacy.blank("en")
if IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)

# add labels
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

params = {"n_iter": 100, "drop": 0.5}
newron.log_params(params)

nlp.begin_training()
for itn in range(params["n_iter"]):
    random.shuffle(TRAIN_DATA)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    annotations_list = []
    for batch in batches:
        for text, annotations in batch:
            try:
            # create Example
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                annotations_list.append(example)
            except:
                pass
            nlp.update(
                annotations_list,  # batch of texts
                drop=0.2,  # dropout - make it harder to memorise data
                losses=losses,
                )
    print("Losses", losses)
    newron.log_metrics(losses)

# Log the spaCy model using mlflow
newron.spacy.log_model(spacy_model=nlp, artifact_path="model")
model_uri = "runs:/{run_id}/{artifact_path}".format(
    run_id=newron.active_run().info.run_id, artifact_path="model"
)

print("Model saved in run %s" % newron.active_run().info.run_uuid)

# Load the model using mlflow and use it to predict data
nlp2 = newron.spacy.load_model(model_uri=model_uri)
for text, _ in TRAIN_DATA:
    doc = nlp2(text)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Losses {'ner': 15.053031861782074}
Losses {'ner': 13.220588386058807}
Losses {'ner': 12.571981489658356}
Losses {'ner': 10.113959968090057}
Losses {'ner': 7.883271664381027}
Losses {'ner': 6.2573287934064865}
Losses {'ner': 5.980585306882858}
Losses {'ner': 5.139594436157495}
Losses {'ner': 4.2203461427707225}
Losses {'ner': 7.436829994228901}
Losses {'ner': 6.766827849991387}
Losses {'ner': 5.3287234820418234}
Losses {'ner': 3.131577859574463}
Losses {'ner': 3.355524822487496}
Losses {'ner': 1.3667628129478544}
Losses {'ner': 0.880070379236713}
Losses {'ner': 0.4369672241155058}
Losses {'ner': 0.30358853541110875}
Losses {'ner': 0.0638207234442234}
Losses {'ner': 0.0067083810451435966}
Losses {'ner': 0.00010123948069917788}
Losses {'ner': 3.7485692723882025e-05}
Losses {'ner': 1.4947613554250339e-05}
Losses {'ner': 7.1221805289165e-07}
Losses {'ner': 2.5979704925023495e-07}
Losses {'ner': 1.3994837795182e-07}
Losses {'ner': 3.239396012715066e-08}
Losses {'ner': 1.8645651549526414e-08}



Losses {'ner': 1.1930844045320332e-09}




Model saved in run 47b0c92f89bb4545ad63d5f3ea741290
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
Entities [('Shaka Khan', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3), ('Khan', 'PERSON', 1), ('?', '', 2)]
