In [1]:
from datasets import load_dataset
data = load_dataset("eriktks/conll2003", trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sample = data['train'][0]

In [3]:
label_map = {
    0: "O",       
    1: "B-PER",   
    2: "I-PER",   
    3: "B-ORG",  
    4: "I-ORG",   
    5: "B-LOC",  
    6: "I-LOC",   
    7: "B-MISC",  
    8: "I-MISC", 
}


In [4]:
sample

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [5]:
import spacy
from spacy.tokens import DocBin
from spacy.training.iob_utils import biluo_tags_to_offsets
from spacy.training.iob_utils import iob_to_biluo
def convert_entry(entry, label_map, nlp):
    tokens = entry['tokens']
    ner_tags = [label_map.get(tag, 'O') for tag in entry['ner_tags']]
    text = " ".join(tokens)
    doc = nlp(text)
    biluo_tags = iob_to_biluo(ner_tags)
    spans = biluo_tags_to_offsets(doc, biluo_tags)
    entities = [doc.char_span(start, end, label=label) for start, end, label in spans if doc.char_span(start, end, label=label)]
    doc.ents = entities
    return doc

# Main conversion
def convert_dataset(dataset, label_map, output_path):
    nlp = spacy.blank("en")
    db = DocBin()
    for entry in dataset:
        doc = convert_entry(entry, label_map, nlp)
        db.add(doc)
    db.to_disk(output_path)


In [6]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [7]:
nlp = spacy.blank("en")
doc = convert_entry(sample, label_map, nlp)
doc.ents
doc.text

'EU rejects German call to boycott British lamb .'

In [8]:
convert_dataset(data['train'], label_map, "train.spacy")
convert_dataset(data['test'], label_map, "test.spacy")
convert_dataset(data['validation'], label_map, "validation.spacy")

In [9]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./validation.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     44.28    0.00    0.00    0.00    0.00
  0     200        613.24   3547.70   35.52   43.82   29.86    0.36
  0     400        243.50   2935.96   52.03   54.26   49.98    0.52
  0     600        286.67   2865.47   63.14   64.85   61.51    0.63
  0     800        374.02   2995.49   67.54   69.03   66.12    0.68
  0    1000        550.72   3641.20   69.48   71.55   67.54    0.69
  1    1200        642.22   3773.55   72.97   74.70   71.32    0.73
  1    1400        776.95   3766.72   71.84   73.79   69.99    0.72
  1    1600        984.89   4565.72   74.45   76.56   72.45    0.74
  2    1800       1337.73   4864.46   74.95

In [10]:
!python -m spacy evaluate output/model-best test.spacy --output metrics.json

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   73.61 
NER R   70.41 
NER F   71.98 
SPEED   16058 

[1m

           P       R       F
LOC    75.06   77.04   76.04
PER    74.24   70.75   72.45
ORG    72.48   67.07   69.67
MISC   70.80   61.82   66.01

[38;5;2m✔ Saved results to metrics.json[0m


In [11]:
import json
with open("metrics.json", "r") as f:
    metrics = json.load(f)
print(metrics["ents_p"], metrics["ents_r"], metrics["ents_f"])


0.7360725523 0.7041430595 0.7197538684
