In [1]:
import spacy
from spacy.tokens import DocBin

# Container that collects the annotated Doc objects before they are
# serialized to disk.
doc_bin = DocBin()

# Blank English pipeline; it is only used for tokenization (`make_doc`) when
# the training documents are built.
nlp = spacy.blank("en")


In [2]:


from spacy.util import filter_spans
import generate
from tqdm import tqdm

# Build the serialized training corpus from the generated samples.
# Each item is indexed as item[0] -> raw text and item[1]['entities'] ->
# iterable of (start, end, label) character offsets.
data = generate.generate_train_data()

# Start from a fresh DocBin so that re-running this cell does not append the
# same documents a second time to a container left over from an earlier run.
doc_bin = DocBin()

for obj in tqdm(data):
    # A trailing newline is appended; the offsets are unaffected because they
    # all refer to positions before it.
    text = obj[0] + '\n'
    labels = obj[1]['entities']
    doc = nlp.make_doc(text)

    ents = []
    for start, end, label in labels:
        # "expand" snaps offsets that fall inside a token outwards to the
        # token boundaries instead of rejecting the span.
        span = doc.char_span(start, end, label=label, alignment_mode="expand")
        if span is None:
            # Report spans that could not be mapped onto tokens and skip them.
            print("Skipping entity ({}-{}) in {}".format(start, end, text))
        else:
            ents.append(span)

    # doc.ents must not contain overlapping spans; filter_spans removes
    # overlaps (preferring the longest span) before assignment.
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

# Persist the collected documents in spaCy's binary training format.
doc_bin.to_disk("training_data.spacy")

100%|██████████| 20/20 [00:00<00:00, 47.95it/s]


In [3]:
# use the training data to train the ner model
import spacy
import random
import warnings
from spacy.training import Example
from spacy.util import minibatch, compounding
from tqdm import tqdm

# Load the pretrained small English pipeline. Its existing "ner" component is
# the one that gets updated by the training loop; nothing is created from
# scratch here.
nlp = spacy.load("en_core_web_sm")

# Fetch the NER component that ships with the loaded pipeline.
ner = nlp.get_pipe("ner")

# Collect every distinct entity label found in the annotations exactly once.
# dict.fromkeys drops duplicates while keeping first-seen order, so labels are
# registered in the same order as before, just without repeated calls.
labels = list(dict.fromkeys(ent[2] for d in data for ent in d[1]['entities']))

# Register the custom labels with the NER component.
for label in labels:
    ner.add_label(label)


In [7]:
# Fine-tune the NER component on the generated samples.
#
# Work on a private copy of the samples: random.shuffle reorders its argument
# in place, and `data` is shared with other cells that should keep seeing it
# in its original order.
train_data = list(data)

# Only the "ner" pipe is enabled inside the `with` block; warning filter
# changes are likewise scoped to the block by catch_warnings().
with nlp.select_pipes(enable=["ner"]), warnings.catch_warnings():
    # Show warnings for misaligned entity spans once
    warnings.filterwarnings("once", category=UserWarning, module='spacy')

    for itn in tqdm(range(200)):
        random.shuffle(train_data)
        # Accumulates the loss over all batches of this pass.
        losses = {}
        # Batch up the examples using spaCy's minibatch. The compounding
        # schedule is re-created on every pass, so the batch size starts again
        # from 8.0 each time.
        batches = minibatch(train_data, size=compounding(8.0, 32.0, 1.01))
        for batch in batches:
            # spaCy v3 trains on Example objects: pair a freshly tokenized Doc
            # with the annotation dict of each (text, annotation) sample.
            examples = [
                Example.from_dict(nlp.make_doc(text), annotation)
                for text, annotation in batch
            ]

            # Update the model with the examples
            nlp.update(examples, drop=0.2, losses=losses)
        print("Losses", losses)


<clients>
 ..." with entities "[(88, 94, 'PERSON'), (206, 213, 'PERSON'), (712, 7...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
<users>
   ..." with entities "[(85, 91, 'PERSON'), (224, 259, 'ADDRESS'), (367, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

..." with entities "[(610, 635, 'IBAN'), (645, 656, 'PERSON'), (664, 6...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

This PowerShe..." with entities "[(1465, 1480, 'CLIENT_NUMBER'), (1488, 1515, 'IBAN...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  0%|          | 1/200 [00:03<10:47

Losses {'ner': 3.0249868371234387}


  1%|          | 2/200 [00:06<10:55,  3.31s/it]

Losses {'ner': 6.46628869699237}


  2%|▏         | 3/200 [00:09<10:34,  3.22s/it]

Losses {'ner': 5.265929386200149}


  2%|▏         | 4/200 [00:13<10:40,  3.27s/it]

Losses {'ner': 8.103600280301329}


  2%|▎         | 5/200 [00:16<10:27,  3.22s/it]

Losses {'ner': 7.357366404873756}


  3%|▎         | 6/200 [00:19<10:30,  3.25s/it]

Losses {'ner': 8.57240539086185}


  4%|▎         | 7/200 [00:22<10:32,  3.27s/it]

Losses {'ner': 3.933787643212845}


  4%|▍         | 8/200 [00:26<10:28,  3.27s/it]

Losses {'ner': 5.437249452910953}


  4%|▍         | 9/200 [00:29<10:11,  3.20s/it]

Losses {'ner': 4.973084340616088}


  5%|▌         | 10/200 [00:32<09:58,  3.15s/it]

Losses {'ner': 4.075626789511303}


  6%|▌         | 11/200 [00:35<09:49,  3.12s/it]

Losses {'ner': 6.866458593764579}


  6%|▌         | 12/200 [00:38<09:42,  3.10s/it]

Losses {'ner': 3.37848346196767}


  6%|▋         | 13/200 [00:41<09:48,  3.14s/it]

Losses {'ner': 5.435799358709532}


  7%|▋         | 14/200 [00:44<09:48,  3.17s/it]

Losses {'ner': 4.616521097834192}


  8%|▊         | 15/200 [00:47<09:34,  3.11s/it]

Losses {'ner': 4.313459967678545}


  8%|▊         | 16/200 [00:50<09:37,  3.14s/it]

Losses {'ner': 6.029252291704429}


  8%|▊         | 17/200 [00:53<09:24,  3.09s/it]

Losses {'ner': 3.3906226578781}


  9%|▉         | 18/200 [00:57<09:27,  3.12s/it]

Losses {'ner': 5.698492743969187}


 10%|▉         | 19/200 [01:00<09:23,  3.12s/it]

Losses {'ner': 2.959430170151754}


 10%|█         | 20/200 [01:03<09:12,  3.07s/it]

Losses {'ner': 8.237986823036367}


 10%|█         | 21/200 [01:06<09:05,  3.05s/it]

Losses {'ner': 5.577628580975331}


 11%|█         | 22/200 [01:09<08:58,  3.03s/it]

Losses {'ner': 4.177839305555052}


 12%|█▏        | 23/200 [01:12<09:10,  3.11s/it]

Losses {'ner': 6.160040520391774}


 12%|█▏        | 24/200 [01:15<09:05,  3.10s/it]

Losses {'ner': 6.886724871361087}


 12%|█▎        | 25/200 [01:18<08:51,  3.03s/it]

Losses {'ner': 6.110032821213332}


 13%|█▎        | 26/200 [01:21<08:46,  3.02s/it]

Losses {'ner': 6.220980840686572}


 14%|█▎        | 27/200 [01:24<08:44,  3.03s/it]

Losses {'ner': 11.799830463587591}


 14%|█▍        | 28/200 [01:27<08:38,  3.01s/it]

Losses {'ner': 6.8177326806068095}


 14%|█▍        | 29/200 [01:30<08:43,  3.06s/it]

Losses {'ner': 2.7119688919198937}


 15%|█▌        | 30/200 [01:33<08:39,  3.06s/it]

Losses {'ner': 9.297152569492669}


 16%|█▌        | 31/200 [01:36<08:32,  3.03s/it]

Losses {'ner': 4.107393736616955}


 16%|█▌        | 32/200 [01:39<08:35,  3.07s/it]

Losses {'ner': 9.316344624337546}


 16%|█▋        | 33/200 [01:43<08:46,  3.15s/it]

Losses {'ner': 14.013104105521554}


 17%|█▋        | 34/200 [01:46<08:42,  3.14s/it]

Losses {'ner': 6.817954847779461}


 18%|█▊        | 35/200 [01:49<08:42,  3.17s/it]

Losses {'ner': 4.525564870516726}


 18%|█▊        | 36/200 [01:52<08:51,  3.24s/it]

Losses {'ner': 5.593417825462014}


 18%|█▊        | 37/200 [01:56<08:59,  3.31s/it]

Losses {'ner': 4.306165496463656}


 19%|█▉        | 38/200 [01:59<09:02,  3.35s/it]

Losses {'ner': 2.6540663832086246}


 20%|█▉        | 39/200 [02:02<08:51,  3.30s/it]

Losses {'ner': 5.567167345567213}


 20%|██        | 40/200 [02:06<08:58,  3.37s/it]

Losses {'ner': 3.6024924113924346}


 20%|██        | 41/200 [02:09<08:49,  3.33s/it]

Losses {'ner': 5.588042555853063}


 21%|██        | 42/200 [02:13<08:46,  3.33s/it]

Losses {'ner': 2.342558738507548}


 22%|██▏       | 43/200 [02:16<08:39,  3.31s/it]

Losses {'ner': 4.5202995562859}


 22%|██▏       | 44/200 [02:19<08:31,  3.28s/it]

Losses {'ner': 3.310169361725133}


 22%|██▎       | 45/200 [02:23<08:40,  3.36s/it]

Losses {'ner': 4.1157019237196515}


 23%|██▎       | 46/200 [02:26<08:33,  3.33s/it]

Losses {'ner': 4.0909070367277955}


 24%|██▎       | 47/200 [02:29<08:35,  3.37s/it]

Losses {'ner': 2.5648823480143865}


 24%|██▍       | 48/200 [02:33<08:33,  3.38s/it]

Losses {'ner': 5.66908818618371}


 24%|██▍       | 49/200 [02:36<08:19,  3.30s/it]

Losses {'ner': 4.879244850105728}


 25%|██▌       | 50/200 [02:39<08:17,  3.32s/it]

Losses {'ner': 4.863604700177751}


 26%|██▌       | 51/200 [02:42<08:13,  3.31s/it]

Losses {'ner': 4.457398087521338}


 26%|██▌       | 52/200 [02:46<08:04,  3.27s/it]

Losses {'ner': 4.1841809887564825}


 26%|██▋       | 53/200 [02:49<07:56,  3.24s/it]

Losses {'ner': 8.775915069340508}


 27%|██▋       | 54/200 [02:52<08:03,  3.31s/it]

Losses {'ner': 3.401375125938685}


 28%|██▊       | 55/200 [02:56<08:08,  3.37s/it]

Losses {'ner': 3.994249855064045}


 28%|██▊       | 56/200 [02:59<08:03,  3.36s/it]

Losses {'ner': 6.472469732742148}


 28%|██▊       | 57/200 [03:03<08:18,  3.49s/it]

Losses {'ner': 7.318341036138005}


 29%|██▉       | 58/200 [03:06<08:11,  3.46s/it]

Losses {'ner': 5.117199817908809}


 30%|██▉       | 59/200 [03:10<08:12,  3.49s/it]

Losses {'ner': 6.285657908981914}


 30%|███       | 60/200 [03:13<08:13,  3.53s/it]

Losses {'ner': 6.6161634506294575}


 30%|███       | 61/200 [03:17<08:11,  3.54s/it]

Losses {'ner': 14.317190556007656}


 31%|███       | 62/200 [03:21<08:24,  3.66s/it]

Losses {'ner': 8.985327065009004}


 32%|███▏      | 63/200 [03:24<08:09,  3.57s/it]

Losses {'ner': 9.452720783271278}


 32%|███▏      | 64/200 [03:28<08:07,  3.58s/it]

Losses {'ner': 7.16730288047643}


 32%|███▎      | 65/200 [03:31<07:44,  3.44s/it]

Losses {'ner': 22.181738283160286}


 33%|███▎      | 66/200 [03:34<07:26,  3.33s/it]

Losses {'ner': 5.003576529579623}


 34%|███▎      | 67/200 [03:37<07:20,  3.31s/it]

Losses {'ner': 6.0504478063309985}


 34%|███▍      | 68/200 [03:41<07:13,  3.28s/it]

Losses {'ner': 3.5072339125687324}


 34%|███▍      | 69/200 [03:44<06:58,  3.19s/it]

Losses {'ner': 5.0466316698520295}


 35%|███▌      | 70/200 [03:47<06:49,  3.15s/it]

Losses {'ner': 4.214563058710717}


 36%|███▌      | 71/200 [03:50<06:49,  3.18s/it]

Losses {'ner': 20.226321904110524}


 36%|███▌      | 72/200 [03:53<06:52,  3.22s/it]

Losses {'ner': 11.066064748645033}


 36%|███▋      | 73/200 [03:57<07:01,  3.32s/it]

Losses {'ner': 4.576197248458968}


 37%|███▋      | 74/200 [04:01<07:16,  3.46s/it]

Losses {'ner': 17.482618580632316}


 38%|███▊      | 75/200 [04:05<07:37,  3.66s/it]

Losses {'ner': 4.852788720965076}


 38%|███▊      | 76/200 [04:08<07:27,  3.61s/it]

Losses {'ner': 2.4171160255499773}


 38%|███▊      | 77/200 [04:12<07:33,  3.69s/it]

Losses {'ner': 4.46637390214289}


 39%|███▉      | 78/200 [04:15<07:12,  3.55s/it]

Losses {'ner': 7.943155989339298}


 40%|███▉      | 79/200 [04:18<06:54,  3.42s/it]

Losses {'ner': 5.025790336181758}


 40%|████      | 80/200 [04:21<06:35,  3.29s/it]

Losses {'ner': 9.577534236757346}


 40%|████      | 81/200 [04:24<06:23,  3.22s/it]

Losses {'ner': 9.45012312810587}


 41%|████      | 82/200 [04:28<06:17,  3.20s/it]

Losses {'ner': 7.794633696954253}


 42%|████▏     | 83/200 [04:31<06:30,  3.34s/it]

Losses {'ner': 4.374773789510777}


 42%|████▏     | 84/200 [04:35<06:29,  3.36s/it]

Losses {'ner': 5.690201079926956}


 42%|████▎     | 85/200 [04:38<06:33,  3.42s/it]

Losses {'ner': 6.635502128963616}


 43%|████▎     | 86/200 [04:41<06:18,  3.32s/it]

Losses {'ner': 4.495847166404372}


 44%|████▎     | 87/200 [04:44<06:04,  3.23s/it]

Losses {'ner': 12.576801847734403}


 44%|████▍     | 88/200 [04:47<05:56,  3.18s/it]

Losses {'ner': 3.2164370910905795}


 44%|████▍     | 89/200 [04:51<05:51,  3.16s/it]

Losses {'ner': 11.42227788254915}


 45%|████▌     | 90/200 [04:54<05:46,  3.15s/it]

Losses {'ner': 6.357158625404328}


 46%|████▌     | 91/200 [04:57<05:44,  3.17s/it]

Losses {'ner': 7.448497487982544}


 46%|████▌     | 92/200 [05:00<05:42,  3.17s/it]

Losses {'ner': 3.399705386462539}


 46%|████▋     | 93/200 [05:04<05:50,  3.28s/it]

Losses {'ner': 4.4781917383923515}


 47%|████▋     | 94/200 [05:07<05:38,  3.19s/it]

Losses {'ner': 4.310209505223949}


 48%|████▊     | 95/200 [05:09<05:26,  3.11s/it]

Losses {'ner': 4.610148679521899}


 48%|████▊     | 96/200 [05:13<05:23,  3.11s/it]

Losses {'ner': 5.204606576532929}


 48%|████▊     | 97/200 [05:16<05:18,  3.09s/it]

Losses {'ner': 2.886544840273562}


 49%|████▉     | 98/200 [05:19<05:13,  3.07s/it]

Losses {'ner': 18.956393791030678}


 50%|████▉     | 99/200 [05:22<05:09,  3.07s/it]

Losses {'ner': 10.575226883972809}


 50%|█████     | 100/200 [05:25<05:08,  3.09s/it]

Losses {'ner': 3.0071528744190394}


 50%|█████     | 101/200 [05:28<05:08,  3.11s/it]

Losses {'ner': 8.197704866150318}


 51%|█████     | 102/200 [05:31<05:05,  3.12s/it]

Losses {'ner': 4.441149080219302}


 52%|█████▏    | 103/200 [05:34<05:00,  3.09s/it]

Losses {'ner': 9.99817227533514}


 52%|█████▏    | 104/200 [05:38<05:04,  3.17s/it]

Losses {'ner': 16.038963173230734}


 52%|█████▎    | 105/200 [05:40<04:53,  3.09s/it]

Losses {'ner': 18.077378968964005}


 53%|█████▎    | 106/200 [05:43<04:46,  3.05s/it]

Losses {'ner': 4.580187170603074}


 54%|█████▎    | 107/200 [05:47<04:47,  3.09s/it]

Losses {'ner': 7.128382373988082}


 54%|█████▍    | 108/200 [05:50<04:40,  3.04s/it]

Losses {'ner': 9.427552750026951}


 55%|█████▍    | 109/200 [05:53<04:36,  3.04s/it]

Losses {'ner': 3.2054759131783133}


 55%|█████▌    | 110/200 [05:56<04:54,  3.27s/it]

Losses {'ner': 5.3087561546050654}


 56%|█████▌    | 111/200 [06:00<05:04,  3.42s/it]

Losses {'ner': 5.073674636026977}


 56%|█████▌    | 112/200 [06:03<04:56,  3.37s/it]

Losses {'ner': 9.173292029726813}


 56%|█████▋    | 113/200 [06:07<04:50,  3.34s/it]

Losses {'ner': 8.244122409146302}


 57%|█████▋    | 114/200 [06:10<04:40,  3.27s/it]

Losses {'ner': 5.668769044501929}


 57%|█████▊    | 115/200 [06:13<04:34,  3.23s/it]

Losses {'ner': 7.133483028585807}


 58%|█████▊    | 116/200 [06:16<04:30,  3.23s/it]

Losses {'ner': 5.5393262339384695}


 58%|█████▊    | 117/200 [06:19<04:23,  3.18s/it]

Losses {'ner': 10.168015006235732}


 59%|█████▉    | 118/200 [06:22<04:17,  3.13s/it]

Losses {'ner': 6.559280498023889}


 60%|█████▉    | 119/200 [06:25<04:12,  3.11s/it]

Losses {'ner': 5.502921920742884}


 60%|██████    | 120/200 [06:29<04:17,  3.22s/it]

Losses {'ner': 8.414741433113056}


 60%|██████    | 121/200 [06:32<04:12,  3.19s/it]

Losses {'ner': 4.620152798451589}


 61%|██████    | 122/200 [06:35<04:06,  3.16s/it]

Losses {'ner': 4.000648728198135}


 62%|██████▏   | 123/200 [06:38<04:08,  3.22s/it]

Losses {'ner': 7.061458822880439}


 62%|██████▏   | 124/200 [06:42<04:10,  3.29s/it]

Losses {'ner': 11.190753130781225}


 62%|██████▎   | 125/200 [06:45<04:01,  3.23s/it]

Losses {'ner': 2.7181023680441703}


 63%|██████▎   | 126/200 [06:48<03:54,  3.17s/it]

Losses {'ner': 4.790344129306142}


 64%|██████▎   | 127/200 [06:51<03:57,  3.26s/it]

Losses {'ner': 5.40954896954431}


 64%|██████▍   | 128/200 [06:54<03:51,  3.21s/it]

Losses {'ner': 3.7345202194844194}


 64%|██████▍   | 129/200 [06:57<03:43,  3.15s/it]

Losses {'ner': 3.7775283241334447}


 65%|██████▌   | 130/200 [07:01<03:40,  3.15s/it]

Losses {'ner': 15.45877263880767}


 66%|██████▌   | 131/200 [07:04<03:37,  3.15s/it]

Losses {'ner': 5.555609098515637}


 66%|██████▌   | 132/200 [07:07<03:31,  3.11s/it]

Losses {'ner': 4.733143665496716}


 66%|██████▋   | 133/200 [07:10<03:27,  3.10s/it]

Losses {'ner': 4.711913387030267}


 67%|██████▋   | 134/200 [07:13<03:25,  3.11s/it]

Losses {'ner': 3.1784842441130454}


 68%|██████▊   | 135/200 [07:16<03:27,  3.20s/it]

Losses {'ner': 9.623069169542362}


 68%|██████▊   | 136/200 [07:20<03:25,  3.22s/it]

Losses {'ner': 5.5064815310552655}


 68%|██████▊   | 137/200 [07:23<03:23,  3.23s/it]

Losses {'ner': 11.540385347760946}


 69%|██████▉   | 138/200 [07:26<03:18,  3.21s/it]

Losses {'ner': 9.064567377394484}


 70%|██████▉   | 139/200 [07:29<03:17,  3.23s/it]

Losses {'ner': 3.769824592477279}


 70%|███████   | 140/200 [07:32<03:12,  3.20s/it]

Losses {'ner': 4.735119331344837}


 70%|███████   | 141/200 [07:36<03:06,  3.16s/it]

Losses {'ner': 5.9846604970437935}


 71%|███████   | 142/200 [07:39<03:06,  3.22s/it]

Losses {'ner': 3.908404305769847}


 72%|███████▏  | 143/200 [07:42<03:06,  3.28s/it]

Losses {'ner': 4.75128663909583}


 72%|███████▏  | 144/200 [07:46<03:07,  3.34s/it]

Losses {'ner': 8.045053199052346}


 72%|███████▎  | 145/200 [07:49<03:04,  3.35s/it]

Losses {'ner': 11.231485981213828}


 73%|███████▎  | 146/200 [07:53<03:02,  3.38s/it]

Losses {'ner': 7.095278747990108}


 74%|███████▎  | 147/200 [07:56<03:01,  3.43s/it]

Losses {'ner': 7.237649632435614}


 74%|███████▍  | 148/200 [07:59<02:54,  3.36s/it]

Losses {'ner': 17.65366806117297}


 74%|███████▍  | 149/200 [08:03<02:48,  3.30s/it]

Losses {'ner': 7.983837218408688}


 75%|███████▌  | 150/200 [08:06<02:44,  3.30s/it]

Losses {'ner': 5.6167044578783605}


 76%|███████▌  | 151/200 [08:09<02:45,  3.37s/it]

Losses {'ner': 6.331850037640297}


 76%|███████▌  | 152/200 [08:12<02:37,  3.28s/it]

Losses {'ner': 8.927479936945293}


 76%|███████▋  | 153/200 [08:16<02:34,  3.29s/it]

Losses {'ner': 5.084482151103956}


 77%|███████▋  | 154/200 [08:19<02:26,  3.19s/it]

Losses {'ner': 4.438770975777208}


 78%|███████▊  | 155/200 [08:22<02:30,  3.34s/it]

Losses {'ner': 8.889049634264227}


 78%|███████▊  | 156/200 [08:26<02:24,  3.29s/it]

Losses {'ner': 8.957413804563213}


 78%|███████▊  | 157/200 [08:29<02:24,  3.37s/it]

Losses {'ner': 7.888024884693255}


 79%|███████▉  | 158/200 [08:33<02:22,  3.38s/it]

Losses {'ner': 5.436540320103793}


 80%|███████▉  | 159/200 [08:36<02:15,  3.32s/it]

Losses {'ner': 5.56060393903671}


 80%|████████  | 160/200 [08:39<02:16,  3.40s/it]

Losses {'ner': 3.92648646949037}


 80%|████████  | 161/200 [08:43<02:15,  3.47s/it]

Losses {'ner': 10.219012353726125}


 81%|████████  | 162/200 [08:46<02:10,  3.44s/it]

Losses {'ner': 4.502068890576814}


 82%|████████▏ | 163/200 [08:50<02:09,  3.51s/it]

Losses {'ner': 7.568395696534501}


 82%|████████▏ | 164/200 [08:53<02:03,  3.44s/it]

Losses {'ner': 5.570052894425665}


 82%|████████▎ | 165/200 [08:57<01:58,  3.40s/it]

Losses {'ner': 21.567421770195423}


 83%|████████▎ | 166/200 [09:00<01:52,  3.32s/it]

Losses {'ner': 15.698579885907657}


 84%|████████▎ | 167/200 [09:03<01:47,  3.25s/it]

Losses {'ner': 23.16570203764654}


 84%|████████▍ | 168/200 [09:06<01:45,  3.31s/it]

Losses {'ner': 7.1242575213439725}


 84%|████████▍ | 169/200 [09:10<01:42,  3.32s/it]

Losses {'ner': 5.9392438480594825}


 85%|████████▌ | 170/200 [09:13<01:36,  3.23s/it]

Losses {'ner': 4.956425777949953}


 86%|████████▌ | 171/200 [09:16<01:32,  3.18s/it]

Losses {'ner': 8.892665141785947}


 86%|████████▌ | 172/200 [09:19<01:30,  3.24s/it]

Losses {'ner': 7.908341876624497}


 86%|████████▋ | 173/200 [09:22<01:26,  3.20s/it]

Losses {'ner': 5.673836005798101}


 87%|████████▋ | 174/200 [09:25<01:23,  3.21s/it]

Losses {'ner': 5.043799400121049}


 88%|████████▊ | 175/200 [09:29<01:25,  3.42s/it]

Losses {'ner': 6.001995869762179}


 88%|████████▊ | 176/200 [09:33<01:21,  3.41s/it]

Losses {'ner': 3.263315357640743}


 88%|████████▊ | 177/200 [09:36<01:20,  3.51s/it]

Losses {'ner': 1.7522350794627017}


 89%|████████▉ | 178/200 [09:40<01:16,  3.46s/it]

Losses {'ner': 20.05621042064601}


 90%|████████▉ | 179/200 [09:43<01:12,  3.43s/it]

Losses {'ner': 7.455674087362053}


 90%|█████████ | 180/200 [09:47<01:10,  3.51s/it]

Losses {'ner': 2.376647040610473}


 90%|█████████ | 181/200 [09:50<01:05,  3.47s/it]

Losses {'ner': 9.2112826523836}


 91%|█████████ | 182/200 [09:54<01:03,  3.55s/it]

Losses {'ner': 4.523123417952745}


 92%|█████████▏| 183/200 [09:58<01:01,  3.61s/it]

Losses {'ner': 4.219324247000795}


 92%|█████████▏| 184/200 [10:01<00:55,  3.46s/it]

Losses {'ner': 12.801049050195882}


 92%|█████████▎| 185/200 [10:04<00:49,  3.33s/it]

Losses {'ner': 3.7035720035022535}


 93%|█████████▎| 186/200 [10:07<00:45,  3.25s/it]

Losses {'ner': 5.708509203783809}


 94%|█████████▎| 187/200 [10:10<00:42,  3.25s/it]

Losses {'ner': 23.419996544915783}


 94%|█████████▍| 188/200 [10:13<00:38,  3.21s/it]

Losses {'ner': 13.244563993406995}


 94%|█████████▍| 189/200 [10:16<00:34,  3.15s/it]

Losses {'ner': 9.687665129799964}


 95%|█████████▌| 190/200 [10:20<00:31,  3.19s/it]

Losses {'ner': 14.81300410476067}


 96%|█████████▌| 191/200 [10:23<00:29,  3.24s/it]

Losses {'ner': 10.724858326667846}


 96%|█████████▌| 192/200 [10:26<00:25,  3.16s/it]

Losses {'ner': 9.3937645310287}


 96%|█████████▋| 193/200 [10:30<00:23,  3.39s/it]

Losses {'ner': 17.10308938538329}


 97%|█████████▋| 194/200 [10:33<00:20,  3.47s/it]

Losses {'ner': 24.55660577323265}


 98%|█████████▊| 195/200 [10:37<00:17,  3.55s/it]

Losses {'ner': 37.2480897676153}


 98%|█████████▊| 196/200 [10:41<00:14,  3.62s/it]

Losses {'ner': 11.19316208156691}


 98%|█████████▊| 197/200 [10:44<00:10,  3.53s/it]

Losses {'ner': 13.386902632940092}


 99%|█████████▉| 198/200 [10:48<00:07,  3.51s/it]

Losses {'ner': 6.287819857508086}


100%|█████████▉| 199/200 [10:52<00:03,  3.59s/it]

Losses {'ner': 10.137483094436826}


100%|██████████| 200/200 [10:55<00:00,  3.28s/it]

Losses {'ner': 15.619288982603237}





In [23]:
# Test the trained model.
# Sample input: CSV rows of a two-letter code and an English language name
# (header "alpha2,English"); presumably ISO 639-1 codes -- confirm the source.
text = """
alpha2,English
aa,Afar
ab,Abkhazian
ae,Avestan
af,Afrikaans
ak,Akan
am,Amharic
an,Aragonese
ar,Arabic
as,Assamese
av,Avaric
ay,Aymara
az,Azerbaijani
ba,Bashkir
be,Belarusian
bg,Bulgarian
bh,Bihari languages
bi,Bislama
bm,Bambara
bn,Bengali
bo,Tibetan
br,Breton
bs,Bosnian
ca,Catalan; Valencian
ce,Chechen
ch,Chamorro
co,Corsican
cr,Cree
cs,Czech
cu,Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic
cv,Chuvash
cy,Welsh
da,Danish
de,German
dv,Divehi; Dhivehi; Maldivian
dz,Dzongkha
ee,Ewe
el,"Greek, Modern (1453-)"
en,English
eo,Esperanto
es,Spanish; Castilian
et,Estonian
eu,Basque
fa,Persian
ff,Fulah
fi,Finnish
fj,Fijian
fo,Faroese
fr,French
fy,Western Frisian
ga,Irish
gd,Gaelic; Scottish Gaelic
gl,Galician
gn,Guarani
gu,Gujarati
gv,Manx
ha,Hausa
he,Hebrew
hi,Hindi
ho,Hiri Motu
hr,Croatian
ht,Haitian; Haitian Creole
hu,Hungarian
hy,Armenian
hz,Herero
ia,Interlingua (International Auxiliary Language Association)
id,Indonesian
ie,Interlingue; Occidental
ig,Igbo
ii,Sichuan Yi; Nuosu
ik,Inupiaq
io,Ido
is,Icelandic
it,Italian
iu,Inuktitut
ja,Japanese
jv,Javanese
ka,Georgian
kg,Kongo
ki,Kikuyu; Gikuyu
kj,Kuanyama; Kwanyama
kk,Kazakh
kl,Kalaallisut; Greenlandic
km,Central Khmer
kn,Kannada
ko,Korean
kr,Kanuri
ks,Kashmiri
ku,Kurdish
kv,Komi
kw,Cornish
ky,Kirghiz; Kyrgyz
la,Latin
lb,Luxembourgish; Letzeburgesch
lg,Ganda
li,Limburgan; Limburger; Limburgish
ln,Lingala
lo,Lao
lt,Lithuanian
lu,Luba-Katanga
lv,Latvian
mg,Malagasy
mh,Marshallese
mi,Maori
mk,Macedonian
ml,Malayalam
mn,Mongolian
mr,Marathi
ms,Malay
mt,Maltese
my,Burmese
na,Nauru
nb,"Bokmål, Norwegian; Norwegian Bokmål"
nd,"Ndebele, North; North Ndebele"
ne,Nepali
ng,Ndonga
nl,Dutch; Flemish
nn,"Norwegian Nynorsk; Nynorsk, Norwegian"
no,Norwegian
nr,"Ndebele, South; South Ndebele"
nv,Navajo; Navaho
ny,Chichewa; Chewa; Nyanja
oc,Occitan (post 1500)
oj,Ojibwa
om,Oromo
or,Oriya
os,Ossetian; Ossetic
pa,Panjabi; Punjabi
pi,Pali
pl,Polish
ps,Pushto; Pashto
pt,Portuguese
qu,Quechua
rm,Romansh
rn,Rundi
ro,Romanian; Moldavian; Moldovan
ru,Russian
rw,Kinyarwanda
sa,Sanskrit
sc,Sardinian
sd,Sindhi
se,Northern Sami
sg,Sango
si,Sinhala; Sinhalese
sk,Slovak
sl,Slovenian
sm,Samoan
sn,Shona
so,Somali
sq,Albanian
sr,Serbian
ss,Swati
st,"Sotho, Southern"
su,Sundanese
sv,Swedish
sw,Swahili
ta,Tamil
te,Telugu
tg,Tajik
th,Thai
ti,Tigrinya
tk,Turkmen
tl,Tagalog
tn,Tswana
to,Tonga (Tonga Islands)
tr,Turkish
ts,Tsonga
tt,Tatar
tw,Twi
ty,Tahitian
ug,Uighur; Uyghur
uk,Ukrainian
ur,Urdu
uz,Uzbek
ve,Venda
vi,Vietnamese
vo,Volapük
wa,Walloon
wo,Wolof
xh,Xhosa
yi,Yiddish
yo,Yoruba
za,Zhuang; Chuang
zh,Chinese
zu,Zulu
"""
# Baseline for comparison: a freshly loaded copy of the stock pretrained
# pipeline, independent of the `nlp` object used everywhere else.
nlp2 = spacy.load("en_core_web_sm")

# Run `nlp` first, then the baseline `nlp2`, on the same text and print the
# (text, label) pair of every entity each one predicts.
for pipeline in (nlp, nlp2):
    doc = pipeline(text)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])


Entities [('Haitian Creole', 'PERSON'), ('Sichuan Yi', 'PERSON'), ('Luba-Katanga', 'PERSON'), ('Uyghur', 'PERSON')]
Entities [('Afar\nab', 'PERSON'), ('Abkhazian', 'NORP'), ('Avestan', 'GPE'), ('Afrikaans', 'NORP'), ('Akan', 'ORG'), ('Aragonese', 'NORP'), ('Arabic', 'LANGUAGE'), ('Assamese', 'NORP'), ('Avaric', 'GPE'), ('Aymara', 'GPE'), ('Azerbaijani', 'PERSON'), ('Bashkir', 'PERSON'), ('Belarusian', 'NORP'), ('Bulgarian', 'NORP'), ('Bihari', 'ORG'), ('Bislama', 'GPE'), ('Bambara', 'GPE'), ('Bengali', 'NORP'), ('Tibetan', 'NORP'), ('Breton', 'GPE'), ('Bosnian', 'NORP'), ('Catalan', 'NORP'), ('Valencian', 'ORG'), ('Chechen', 'NORP'), ('Chamorro', 'PERSON'), ('Corsican', 'NORP'), ('Cree', 'NORP'), ('Czech', 'NORP'), ('Church Slavic', 'PERSON'), ('Bulgarian', 'NORP'), ('Chuvash', 'PRODUCT'), ('Welsh\nda', 'PERSON'), ('Danish\nde', 'PERSON'), ('German', 'NORP'), ('Divehi', 'PERSON'), ('Dhivehi', 'GPE'), ('Maldivian', 'NORP'), ('Ewe', 'GPE'), ('Modern', 'ORG'), ('English', 'LANGUAGE'), ('E

In [24]:
# Save the current `nlp` pipeline to the relative path "model".
nlp.to_disk("model")