In [12]:
import spacy

from spacy.tokens import Span


nlp = spacy.load("en_core_web_sm")

doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

doc_LER = nlp("""Calvert Cliffs 2
Auxiliary Feedwater Pump Inoperable due to Improper Reset of Trip Throttle Valve
Abstract: On August 10, 2021 it was determined that the 22 Auxiliary Feedwater (AFW) Pump trip throttle valve was not reset properly in March 2021. As a result, the 22 AFW Pump was determined to be inoperable for a period longer than allowed by the technical specification condition completion time. The 22 AFW Pump was subsequently reset properly. The cause of the improper reset of 22 AFW Pump trip throttle valve was due to inadequate procedural guidance. Actions were taken to update the procedure providing additional detail to ensure the trip throttle valve is reset properly.""")

ents_LER_bf = [(e.text, e.start_char, e.end_char, e.label_) for e in doc_LER.ents]
print('Before: \n', ents_LER_bf)

# Create a span for the new entity
LER_ent = Span(doc_LER, 0, 4, label="PLACE")
orig_ents = list(doc_LER.ents)

# Option 1: Modify the provided entity spans, leaving the rest unmodified
doc_LER.set_ents([LER_ent], default="unmodified")

ents_LER_af = [(e.text, e.start, e.end, e.label_) for e in doc_LER.ents]
print('After: \n', ents_LER_af)

Before: 
 [('Calvert Cliffs', 0, 14, 'PERSON'), ('2', 15, 16, 'CARDINAL'), ('Feedwater Pump Inoperable', 27, 52, 'ORG'), ('Improper Reset of', 60, 77, 'FAC'), ('August 10, 2021', 111, 126, 'DATE'), ('22', 154, 156, 'CARDINAL'), ('Feedwater', 167, 176, 'PERSON'), ('March 2021', 234, 244, 'DATE'), ('22', 263, 265, 'CARDINAL'), ('22', 402, 404, 'CARDINAL'), ('22', 482, 484, 'CARDINAL')]
After: 
 [('Calvert Cliffs 2\n', 0, 4, 'PLACE'), ('Feedwater Pump Inoperable', 5, 8, 'ORG'), ('Improper Reset of', 10, 13, 'FAC'), ('August 10, 2021', 20, 24, 'DATE'), ('22', 29, 30, 'CARDINAL'), ('Feedwater', 31, 32, 'PERSON'), ('March 2021', 44, 46, 'DATE'), ('22', 52, 53, 'CARDINAL'), ('22', 75, 76, 'CARDINAL'), ('22', 90, 91, 'CARDINAL')]


In [14]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
spacy.displacy.render(doc, style="ent")

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [13]:
for ent in doc_LER.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
spacy.displacy.render(doc_LER, style="ent")

Calvert Cliffs 2
 0 17 PLACE
Feedwater Pump Inoperable 27 52 ORG
Improper Reset of 60 77 FAC
August 10, 2021 111 126 DATE
22 154 156 CARDINAL
Feedwater 167 176 PERSON
March 2021 234 244 DATE
22 263 265 CARDINAL
22 402 404 CARDINAL
22 482 484 CARDINAL


In [16]:
display(nlp.vocab)

<spacy.vocab.Vocab at 0x170227fc3a0>

In [3]:
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
training_data = [
  ("Tokyo Tower is 333m tall.", [(0, 11, "BUILDING")]),
]
# the DocBin will store the example documents
db = DocBin()
for text, annotations in training_data:
    doc = nlp(text)
    ents = []
    for start, end, label in annotations:
        span = doc.char_span(start, end, label=label)
        ents.append(span)
    doc.ents = ents
    db.add(doc)
    print([(e.text, e.start, e.end, e.label_) for e in doc.ents])
    
db.to_disk("./train.spacy")

[('Tokyo Tower', 0, 2, 'BUILDING')]


In [15]:
!python -m spacy convert -C -c "ner" 10_3882021001_2003.conll .

[i] Auto-detected token-per-line NER format
[i] Grouping every 1 sentences into a document.
[!] To generate better training data, you may want to group sentences into
documents with `-n 10`.
[+] Generated output file (7 documents): 10_3882021001_2003.spacy


In [32]:
import spacy
import sys, os, shutil

path_train = r'.\ner_ler\data'
path_test = r'.\ner_ler\data_test'
path_corpus = r'.\ner_ler\corpus'

# Generating the training dataset
output = f'ler_training'
if os.path.exists(os.path.join(path_corpus,  f'{output}.spacy')) == False:
    path = path_train
    path_original = os.path.join(path, f'annotation')
    path_modified = os.path.join(path, f'annotation_modified')
    path_spacy    = os.path.join(path, f'annotation_spacy')
    
    if os.path.isdir(path_modified) == False:
        os.mkdir(path_modified)
    if os.path.isdir(path_spacy) == False:
        os.mkdir(path_spacy)

    # with open(os.path.join(path_modified, f'{output}.conll'), 'w', encoding='utf8') as ler:
    #     ler.write("-DOCSTART- -X- O O\n\n")
    for filename in os.listdir(path_original):
        with open(os.path.join(path_original, filename, 'chaut.conll'), 'r', encoding='utf8') as origin:
            text = origin.read()
        with open(os.path.join(path_modified, f'{output}.conll'), 'a', encoding='utf8') as ler:
            ler.write("-DOCSTART- -X- O O\n\n")
        with open(os.path.join(path_modified, f'{output}.conll'), 'a', encoding='utf8') as ler:
            ler.write(text)

    spacy.cli.convert(os.path.join(path_modified, f'{output}.conll'), os.path.join(path_spacy), converter='ner', file_type='spacy', concatenate=True)
    shutil.copyfile(os.path.join(path_spacy, f'{output}.spacy'), os.path.join(path_corpus, f'{output}.spacy'))

# Generating the testing dataset
output = f'ler_eval'
if os.path.exists(os.path.join(path_corpus,  f'{output}.spacy')) == False:
    path = path_test
    path_original = os.path.join(path, f'annotation')
    path_modified = os.path.join(path, f'annotation_modified')
    path_spacy    = os.path.join(path, f'annotation_spacy')
    
    if os.path.isdir(path_modified) == False:
        os.mkdir(path_modified)
    if os.path.isdir(path_spacy) == False:
        os.mkdir(path_spacy)

    
    for filename in os.listdir(path_original):
        with open(os.path.join(path_original, filename, 'rossella.conll'), 'r', encoding='utf8') as origin:
            text = origin.read()
        with open(os.path.join(path_modified, f'{output}.conll'), 'a', encoding='utf8') as ler:
            ler.write("-DOCSTART- -X- O O\n\n")
        with open(os.path.join(path_modified, f'{output}.conll'), 'a', encoding='utf8') as ler:
            ler.write(text)

    spacy.cli.convert(os.path.join(path_modified, f'{output}.conll'), os.path.join(path_spacy), converter='ner', file_type='spacy', concatenate=True)
    shutil.copyfile(os.path.join(path_spacy, f'{output}.spacy'), os.path.join(path_corpus, f'{output}.spacy'))

"Plant _ O O\n: _ O O\nLaSalle _ O O\n1 _ O O\n. _ O O\n\nSafety _ O O\nRelated _ O O\nElectrical _ O O\nBus _ O O\nUndervoltage _ O O\nResults _ O O\nin _ O O\nValid _ O O\nActuation _ O O\nof _ O O\nthe _ O O\nCommon _ O O\nEmergency _ O O\nDiesel _ O O\nGenerator _ O O\n. _ O O\n\nOn _ O O\nFebruary _ O O\n28 _ O O\n, _ O O\n2019 _ O O\n, _ O O\nwhile _ O O\nshut _ O O\ndown _ O O\nfor _ O O\na _ O O\nscheduled _ O O\nrefueling _ O O\noutage _ O O\nin _ O O\nMode _ O O\n5 _ O O\n, _ O O\nUnit _ O O\n2 _ O O\nDivision _ O O\n1 _ O O\nsafety-related _ O O\nelectrical _ O O\nbus _ O O\n( _ O O\n241Y _ O O\n) _ O O\ntripped _ O O\nduring _ O O\nsurveillance _ O O\ntesting _ O O\n. _ O O\n\nThe _ O O\npurpose _ O O\nof _ O O\nthe _ O O\nsurveillance _ O O\ntesting _ O O\nwas _ O O\nto _ O O\ndemonstrate _ O O\nthe _ O O\noperability _ O O\nof _ O O\nthe _ O O\nair _ O O\ncircuit _ O O\nbreakers _ O O\n( _ O O\nACBs _ O O\n) _ O O\nnecessary _ O O\nto _ O O\nprovide _ O O\nthe _ O O\nseco

In [3]:
import spacy

nlp = spacy.load("en_core_web_trf")
doc = nlp("Apple shares rose on the news. Apple pie is delicious.")

In [4]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
spacy.displacy.render(doc, style="ent")

Apple 0 5 ORG


In [2]:
import spacy
from transformers import *

# sample text from Wikipedia
text = """
Albert Einstein was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. 
Einstein is best known for developing the theory of relativity, but he also made important contributions to the development of the theory of quantum mechanics.
Einstein was born in the German Empire, but moved to Switzerland in 1895, forsaking his German citizenship (as a subject of the Kingdom of Württemberg) the following year. 
In 1897, at the age of 17, he enrolled in the mathematics and physics teaching diploma program at the Swiss Federal polytechnic school in Zürich, graduating in 1900
"""
# load BERT model fine-tuned for Named Entity Recognition (NER)
ner = pipeline("ner", model="dslim/bert-base-NER")

# perform inference on the transformer model
doc_ner = ner(text)
# print the output
doc_ner
for ent in doc_ner.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
spacy.displacy.render(doc_ner, style="ent")

loading configuration file https://huggingface.co/dslim/bert-base-NER/resolve/main/config.json from cache at C:\Users\chaut/.cache\huggingface\transformers\a5ff16a1d557b5ad480f50b1d454448475c644d08df9ce8fccabea7745bebd9f.a61836f2236a3ff1a0827544e2d7c512cbb8cd26ed7b32d643526bebb5d7f92e
Model config BertConfig {
  "_name_or_path": "dslim/bert-base-NER",
  "_num_labels": 9,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-MISC",
    "2": "I-MISC",
    "3": "B-PER",
    "4": "I-PER",
    "5": "B-ORG",
    "6": "I-ORG",
    "7": "B-LOC",
    "8": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 7,
    "B-MISC": 1,
    "B-ORG": 5,
    "B-PER": 3,
    "I-LOC": 8,
    "I-MISC": 2,
    "I-ORG": 6,
    "I-PER": 4,
    "O": 0
  },
  "layer_norm_eps": 1e

AttributeError: 'list' object has no attribute 'ents'