In [1]:
# Spacy
!python -m spacy init config config.cfg --lang en --pipeline ner

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [2]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [3]:
nlp = spacy.blank("en")

In [4]:
# Spacy training_data for DocBin
training_data = [
    ("This is a text about Apple Inc.", {"entities": [(21, 31, "ORG")]}),
    ("I like to eat pizza and drink Coke.", {"entities": [(14, 19, "FOOD"), (30, 34, "DRINK")]}),
    ("The weather in Cairo is hot.", {"entities": [(15, 20, "GPE")]})
]


In [6]:
db = DocBin()
for text, annotations in training_data:
    doc = nlp(text)
    ents = []
    for annotation in annotations.values():
        start, end, label = annotation[0]
        span = doc.char_span(start, end, label=label, alignment_mode="strict")
        ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./train.spacy")

In [7]:
# I will build the dev set on the training data to identify the ideal case of training
db = DocBin()
for text, annotations in training_data:
    doc = nlp(text)
    ents = []
    for annotation in annotations.values():
        start, end, label = annotation[0]
        span = doc.char_span(start, end, label=label, alignment_mode="strict")
        ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./dev.spacy")

In [12]:
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy --output ./output

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     14.36    0.00    0.00    0.00    0.00
200     200          1.09    206.43  100.00  100.00  100.00    1.00
400     400          0.00      0.00  100.00  100.00  100.00    1.00
600     600          0.00      0.00  100.00  100.00  100.00    1.00
800     800          0.00      0.00  100.00  100.00  100.00    1.00
1000    1000          0.00      0.00  100.00  100.00  100.00    1.00
1200    1200          0.00      0.00  100.00  100.00  100.00    1.00
1400    1400          0.00      0.00  100.00  100.00  100.00    1.00
1600    1600          0.00      0.00  100.00  100

In [31]:
nlp = spacy.load("./output/model-best")
# This sentence has no meaning just to check if model trained
doc = nlp("The weather in Cairo is hot and Apple Inc is hot pizza and drink Coke.")
print(doc.ents)

(Cairo, Apple Inc, pizza)


In [33]:
!python -m spacy evaluate ./output/model-best dev.spacy --output metrics.json

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   100.00
NER R   100.00
NER F   100.00
SPEED   5609  

[1m

            P        R        F
ORG    100.00   100.00   100.00
FOOD   100.00   100.00   100.00
GPE    100.00   100.00   100.00

[38;5;2m✔ Saved results to metrics.json[0m
