In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [3]:
import spacy
from spacy.tokens import DocBin
from datasets import load_dataset
import transformers

In [None]:
dataset = load_dataset("conll2003")
nlp = spacy.blank("en")
doc_bin = DocBin()

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [6]:
train_examples = dataset["train"]['tokens']
train_tags = dataset["train"]['ner_tags']

In [7]:
print(train_examples[0])
print(train_tags[0])

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
[3, 0, 7, 0, 0, 0, 7, 0, 0]


In [8]:
labels = dataset["train"].features["ner_tags"].feature.names

In [9]:
for tokens, ner_tags in zip(dataset["train"]["tokens"], dataset["train"]["ner_tags"]):
    doc = nlp.make_doc(" ".join(tokens))
    ents = []
    start = 0
    for i, (word, tag_id) in enumerate(zip(tokens, ner_tags)):
        tag = labels[tag_id]
        if tag == "O":
            start += len(word) + 1
            continue
        if tag.startswith("B-"):
            ent_start = start
            ent_end = start + len(word)
            ent_label = tag[2:]

            for j in range(i+1, len(tokens)):
                next_tag = labels[ner_tags[j]]
                if next_tag == f"I-{ent_label}":
                    ent_end += len(tokens[j]) + 1
                else:
                    break
            ents.append((ent_start, ent_end - 1, ent_label))
        start += len(word) + 1

    doc.ents = [doc.char_span(start, end + 1, label=label) for start, end, label in ents if doc.char_span(start, end + 1, label=label)]
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")

In [10]:
for tokens, ner_tags in zip(dataset["validation"]["tokens"], dataset["validation"]["ner_tags"]):
    doc = nlp.make_doc(" ".join(tokens))
    ents = []
    start = 0
    for i, (word, tag_id) in enumerate(zip(tokens, ner_tags)):
        tag = labels[tag_id]
        if tag == "O":
            start += len(word) + 1
            continue
        if tag.startswith("B-"):
            ent_start = start
            ent_end = start + len(word)
            ent_label = tag[2:]

            for j in range(i+1, len(tokens)):
                next_tag = labels[ner_tags[j]]
                if next_tag == f"I-{ent_label}":
                    ent_end += len(tokens[j]) + 1
                else:
                    break
            ents.append((ent_start, ent_end - 1, ent_label))
        start += len(word) + 1

    doc.ents = [doc.char_span(start, end + 1, label=label) for start, end, label in ents if doc.char_span(start, end + 1, label=label)]
    doc_bin.add(doc)

doc_bin.to_disk("dev.spacy")

In [11]:
!python -m spacy init config config.cfg --lang en --pipeline ner

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [15]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy --training.max_epochs 10 --gpu-id 0

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     44.28    0.00    0.00    0.00    0.00
  0     200        156.78   3000.96   50.66   52.03   49.36    0.51
  0     400        507.47   2122.70   69.54   72.37   66.91    0.70
  0     600        347.62   1974.91   76.62   77.75   75.52    0.77
  0     800        348.17   2083.60   82.65   84.95   80.47    0.83
  0    1000        435.35   2261.76   85.81   85.69   85.94    0.86
  1    1200        467.04   2165.55   88.77   89.39   88.17    0.89
  1    1400        505.94   1907.21   89.55   89.04   90.07    0.90
  1    1600        651.82   2187.49   92.45   92.77   92.13    0.92
  2    1800        695.09   2060.17   93.14

In [16]:
for tokens, ner_tags in zip(dataset["test"]["tokens"], dataset["test"]["ner_tags"]):
    doc = nlp.make_doc(" ".join(tokens))
    ents = []
    start = 0
    for i, (word, tag_id) in enumerate(zip(tokens, ner_tags)):
        tag = labels[tag_id]
        if tag == "O":
            start += len(word) + 1
            continue
        if tag.startswith("B-"):
            ent_start = start
            ent_end = start + len(word)
            ent_label = tag[2:]

            for j in range(i+1, len(tokens)):
                next_tag = labels[ner_tags[j]]
                if next_tag == f"I-{ent_label}":
                    ent_end += len(tokens[j]) + 1
                else:
                    break
            ents.append((ent_start, ent_end - 1, ent_label))
        start += len(word) + 1

    doc.ents = [doc.char_span(start, end + 1, label=label) for start, end, label in ents if doc.char_span(start, end + 1, label=label)]
    doc_bin.add(doc)

doc_bin.to_disk("test.spacy")

In [17]:
!python -m spacy evaluate output/model-best ./test.spacy --output metrics.json

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   92.38 
NER R   92.28 
NER F   92.33 
SPEED   25971 

[1m

           P       R       F
ORG    88.22   90.98   89.58
MISC   91.94   90.27   91.10
PER    94.02   92.11   93.06
LOC    94.89   94.53   94.71

[38;5;2m✔ Saved results to metrics.json[0m
