In [85]:
import spacy

#Create config file

In [86]:
!python -m spacy init config config.cfg --lang en --pipeline ner


[38;5;1m✘ The provided output file already exists. To force overwriting the
config file, set the --force or -F flag.[0m



In [None]:
!pip install datasets



#Load Dataset

In [87]:
from datasets import load_dataset

# Load the CoNLL-03 dataset
dataset = load_dataset("conll2003")

# Check the dataset
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


# Explore Data

In [88]:
train_data = dataset['train']
train_data[0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

#Convert datasets to spacy data

In [None]:
from spacy.training.example import Example
from spacy.tokens import DocBin
from datasets import load_dataset

# Load a blank spaCy model
nlp = spacy.blank("en")

# Define the NER labels
ner_labels = {
    0: "O",    # No entity
    1: "PER",
    2: "ORG",
    3: "LOC",
    4: "MISC",
    # Add other labels if necessary
}

# Convert the CoNLL-style dataset to spaCy format
def convert_to_spacy_format(example):
    tokens = example['tokens']
    ner_tags = example['ner_tags']

    # Recreate the full text from tokens
    text = " ".join(tokens)
    doc = nlp.make_doc(text)
    entities = []

    # get correct character offsets
    char_idx = 0
    for token, tag in zip(tokens, ner_tags):
        start = text.find(token, char_idx)
        end = start + len(token)
        char_idx = end  # update for next token

        if tag != 0:
            label = ner_labels.get(tag, "UNKNOWN")
            entities.append((start, end, label))

    # Create character-based entity spans
    spans = []
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            spans.append(span)

    doc.ents = spans
    return Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in spans]})

# Load the dataset
train_data = dataset['train']

# Convert the dataset
spacy_train_data = [convert_to_spacy_format(example) for example in train_data]

# Optional: save to .spacy binary file for spaCy training
doc_bin = DocBin()
for example in spacy_train_data:
    doc_bin.add(example.reference)
doc_bin.to_disk("train.spacy")

# Test output
print(spacy_train_data[0].text)
print([(ent.text, ent.label_) for ent in spacy_train_data[0].reference.ents])

EU rejects German call to boycott British lamb .
[('EU', 'LOC'), ('German', 'UNKNOWN'), ('British', 'UNKNOWN')]


In [None]:
validation_data = dataset['validation']

# Convert the dataset to spaCy format
spacy_validation_data = [convert_to_spacy_format(example) for example in validation_data]

In [None]:
test_data = dataset['test']

# Convert the dataset to spaCy format
spacy_test_data = [convert_to_spacy_format(example) for example in test_data]

#Save datasets

In [None]:
def save_spacy_file(examples, filename):
    doc_bin = DocBin()
    for example in examples:
        doc_bin.add(example.reference)  # reference is the annotated Doc
    doc_bin.to_disk(filename)
    print(f"Saved {filename}")

# Save train, validation, and test data
save_spacy_file(spacy_train_data, "train.spacy")
save_spacy_file(spacy_validation_data, "dev.spacy")
save_spacy_file(spacy_test_data, "test.spacy")


Saved train.spacy
Saved dev.spacy
Saved test.spacy


#Training the model

In [82]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     47.05    0.00    0.00    0.00    0.00
  0     200         92.85   3283.77   63.95   67.19   61.01    0.64
  0     400        171.60   2060.99   76.86   78.53   75.25    0.77
  0     600        227.44   1882.72   80.25   82.12   78.46    0.80
  0     800        296.25   2005.03   84.24   85.54   82.98    0.84
  0    1000        294.57   2105.42   85.33   86.03   84.64    0.85
  1    1200        327.41   2030.03   86.86   88.00   85.75    0.87
  1    1400        443.31   1525.37   88.25   88.66   87.83    0.88
  1    1600        404.12   1721.07   88.28   89.04   87.54    0.88
  2    1800        476.88   1749.80   88.83   

#Evaluate the model on test data

In [83]:
!python -m spacy evaluate ./output/model-best ./test.spacy

[38;5;4mℹ Using CPU[0m
[1m

TOK     100.00
NER P   81.62 
NER R   84.05 
NER F   82.82 
SPEED   12923 

[1m

              P       R       F
UNKNOWN   78.14   87.65   82.63
PER       85.62   81.01   83.25
LOC       81.97   75.02   78.34
ORG       90.38   94.29   92.29
MISC      74.89   81.44   78.03



# Test the model on random sentence

In [None]:
nlp = spacy.load("./output/model-best")

# Test sentence
text = "U.N. official Ekeus heads for Baghdad."
doc = nlp(text)

# Print entities
print("Entities:")
for ent in doc.ents:
    print(ent.text, ent.label_)
