In [1]:
import ast, random, spacy
import pandas as pd
from pathlib import Path
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch, compounding
from sklearn.model_selection import train_test_split
# Scorpio

In [5]:
CSV_PATH = "/content/NER_Dataset.csv"
df = pd.read_csv(CSV_PATH, encoding="latin-1")

df["tokens"] = df["Word"].apply(ast.literal_eval)
df["tags"]   = df["Tag"].apply(ast.literal_eval)

In [6]:
nlp  = spacy.blank("en")
ner  = nlp.add_pipe("ner")

labels = sorted({t.split("-",1)[1] for row in df.tags for t in row if t != "O"})
for lab in labels:
    ner.add_label(lab)

In [7]:
def row_to_example(tokens, tags):
    text, offset, spans = "", 0, []
    for tok, tag in zip(tokens, tags):
        if text:
            text += " "
            offset += 1
        start = offset
        text += tok
        end = offset + len(tok)
        if tag != "O":
            spans.append((start, end, tag.split("-",1)[1]))
        offset = end
    doc = nlp.make_doc(text)
    doc.ents = [doc.char_span(s, e, label=l, alignment_mode="contract")
                for s,e,l in spans if doc.char_span(s, e, label=l)]
    return Example.from_dict(doc, {"entities": spans})

examples = [row_to_example(toks, tags)
            for toks, tags in zip(df.tokens, df.tags)]

In [8]:
train_exs, tmp = train_test_split(examples, test_size=0.2,
                                      random_state=42, shuffle=True)
dev_exs, test_exs  = train_test_split(tmp, test_size=0.5,
                                      random_state=42, shuffle=True)

In [9]:
for name, exs in [("train", train_exs), ("dev", dev_exs), ("test", test_exs)]:
    db = DocBin(store_user_data=True)
    for ex in exs:
        db.add(ex.reference)
    db.to_disk(f"{name}.spacy")

In [None]:
optimizer = nlp.initialize(get_examples=lambda: train_exs)
N_EPOCHS  = 37

for epoch in range(1, N_EPOCHS + 1):
    random.shuffle(train_exs)
    losses = {}

    for batch in minibatch(train_exs, size=compounding(4.0, 32.0, 1.5)):
        nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)

    dev_examples = [Example(nlp(ex.reference.text), ex.reference)
                    for ex in dev_exs]
    dev_scores = nlp.evaluate(dev_examples)

    print(
        f"epoch {epoch:02d}  "
        f"loss={losses['ner']:.3f}  "
        f"P={dev_scores['ents_p']:.2f}  "
        f"R={dev_scores['ents_r']:.2f}  "
        f"F1={dev_scores['ents_f']:.2f}"
    )

epoch 01  loss=61284.378  P=0.85  R=0.84  F1=0.85
epoch 02  loss=40865.917  P=0.86  R=0.86  F1=0.86
epoch 03  loss=36770.088  P=0.87  R=0.85  F1=0.86
epoch 04  loss=34194.790  P=0.87  R=0.86  F1=0.87
epoch 05  loss=32499.618  P=0.87  R=0.86  F1=0.86
epoch 06  loss=30931.744  P=0.87  R=0.85  F1=0.86
epoch 07  loss=29772.791  P=0.88  R=0.86  F1=0.87
epoch 08  loss=28540.543  P=0.88  R=0.86  F1=0.87
epoch 09  loss=27717.044  P=0.87  R=0.87  F1=0.87
epoch 10  loss=26981.188  P=0.87  R=0.88  F1=0.87
epoch 11  loss=26049.764  P=0.88  R=0.87  F1=0.87
epoch 12  loss=25342.150  P=0.88  R=0.87  F1=0.88
epoch 13  loss=25053.035  P=0.88  R=0.87  F1=0.88
epoch 14  loss=24272.777  P=0.88  R=0.87  F1=0.87
epoch 15  loss=24019.191  P=0.88  R=0.87  F1=0.88
epoch 16  loss=23758.517  P=0.88  R=0.86  F1=0.87
epoch 17  loss=22932.794  P=0.88  R=0.87  F1=0.87
epoch 18  loss=22706.040  P=0.87  R=0.87  F1=0.87
epoch 19  loss=22285.696  P=0.88  R=0.87  F1=0.87
epoch 20  loss=22076.073  P=0.88  R=0.87  F1=0.87


In [None]:
test_score = nlp.evaluate([Example(nlp(ex.reference.text), ex.reference)
                           for ex in test_exs])
print("TEST  F1={ents_f:.2f}  P={ents_p:.2f}  R={ents_r:.2f}".format(**test_score))

TEST  F1=0.87  P=0.87  R=0.88


In [None]:
out_dir = Path("best_ner_model")
nlp.to_disk(out_dir)
print(f"model saved to {out_dir.resolve()}")

model saved to /content/best_ner_model


In [10]:
# Creates a small, human-editable template
!python -m spacy init config base_config.cfg \
       --lang en \
       --pipeline ner \
       --optimize efficiency


[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
base_config.cfg
You can now add your data and train your pipeline:
python -m spacy train base_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [11]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [12]:
!python -m spacy train config.cfg \
     --output ./output \
     --paths.train ./train.spacy \
     --paths.dev   ./dev.spacy \
     --gpu-id 0

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     51.41    0.10    2.09    0.05    0.00
  0     200        105.37   3260.95   63.99   66.04   62.05    0.64
  0     400        277.54   2150.17   76.78   77.47   76.10    0.77
  0     600        198.07   2175.77   78.36   79.13   77.61    0.78
  0     800        253.81   2575.24   81.39   83.06   79.79    0.81
  0    1000        277.44   2795.69   82.23   84.02   80.52    0.82
  0    1200        324.41   3161.00   82.74   84.02   81.51    0.83
  0    1400        393.23   3683.37   83.03   84.02   82.06    0.83
  0    1600        454.40   4304.35   84.58   86.01   83.20    

In [13]:
!python -m spacy evaluate output/model-best ./test.spacy --output metrics.json --gpu-id 0

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   88.66 
NER R   86.43 
NER F   87.53 
SPEED   33856 

[1m

          P       R       F
gpe   96.29   93.59   94.92
geo   86.94   91.35   89.09
tim   94.78   87.85   91.19
per   88.24   89.92   89.07
org   84.09   76.81   80.28
eve   65.22   25.86   37.04
nat   60.00   33.33   42.86
art    0.00    0.00    0.00

[38;5;2m✔ Saved results to metrics.json[0m


In [14]:
ld = spacy.load("./output/model-best")

In [17]:
text = "Elon Musk founded Tesla"
doc = ld(text)


for entity in doc.ents:
    print(entity.text, entity.label_)

Elon per
Musk per
Tesla geo
