# Documentation
https://spacy.io/usage/training

In [1]:
#!pip install spacy
#!pip install datasets


**load dataset**

In [2]:
from datasets import load_dataset
dataset = load_dataset("conll2003")
print(dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


**Map Tag IDs to Entity Labels**

In [3]:
label_list = dataset['train'].features['ner_tags'].feature.names
# Example: label_list[3] = 'B-ORG'


# Convert CoNLL2003 to spaCy Format

In [4]:
def conll_to_spacy(dataset_split):
    data = []
    for item in dataset_split:
        tokens = item["tokens"]
        tags = item["ner_tags"]
        text = " ".join(tokens)
        entities = []
        offset = 0

        for token, tag in zip(tokens, tags):
            tag_name = label_list[tag]
            token_start = text.find(token, offset)
            token_end = token_start + len(token)
            offset = token_end

            if tag_name.startswith("B-"):
                entity_label = tag_name[2:]
                entities.append((token_start, token_end, entity_label))

        data.append((text, {"entities": entities}))
    return data

TRAIN_DATA = conll_to_spacy(dataset["train"].select(range(3000)))  # Use first 3K samples
DEV_DATA = conll_to_spacy(dataset["validation"])


## **Set Up and Train spaCy NER Model**

In [5]:
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Add entity labels
for _, annotations in TRAIN_DATA:
    for start, end, label in annotations["entities"]:
        ner.add_label(label)

# Train the model
with nlp.disable_pipes([pipe for pipe in nlp.pipe_names if pipe != "ner"]):
    optimizer = nlp.begin_training()
    for i in range(10):  # 10 epochs
        random.shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.5))
        for batch in batches:
            for text, annotations in batch:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], sgd=optimizer, drop=0.5, losses=losses)
        print(f"Epoch {i+1}: Losses {losses}")


Epoch 1: Losses {'ner': np.float32(5687.4263)}
Epoch 2: Losses {'ner': np.float32(3819.66)}
Epoch 3: Losses {'ner': np.float32(2961.7898)}
Epoch 4: Losses {'ner': np.float32(2471.3584)}
Epoch 5: Losses {'ner': np.float32(2114.7102)}
Epoch 6: Losses {'ner': np.float32(1834.8018)}
Epoch 7: Losses {'ner': np.float32(1594.2517)}
Epoch 8: Losses {'ner': np.float32(1483.9362)}
Epoch 9: Losses {'ner': np.float32(1364.948)}
Epoch 10: Losses {'ner': np.float32(1241.9075)}


**Evaluate the Model**

In [6]:
# Quick evaluation on dev data
correct = 0
total = 0

for text, annotations in DEV_DATA[:100]:
    doc = nlp(text)
    predicted = set((ent.start_char, ent.end_char, ent.label_) for ent in doc.ents)
    actual = set(annotations["entities"])
    correct += len(predicted & actual)
    total += len(actual)

print(f"Accuracy on 100 dev samples: {correct/total:.2%}")


Accuracy on 100 dev samples: 77.27%


**Save and Use the Model**

In [7]:
# Save model
nlp.to_disk("conll2003_ner_model")
# Load and test
nlp2 = spacy.load("conll2003_ner_model")
doc = nlp2("Microsoft announced new plans in Paris.")
for ent in doc.ents:
    print(ent.text, ent.label_)


Microsoft LOC
Paris LOC
