In [1]:
import spacy

In [2]:
# Load the pretrained "en_core_web_sm" pipeline; it is used below as the baseline NER model.
nlp = spacy.load("en_core_web_sm") 

In [3]:
# Baseline example. "Sacramento" stands in for the fictional place "Arrakis",
# which the pretrained model did not recognise as a place.
sentence = "In the week before the departure to Sacramento, when all the final scurrying about had reached a nearly unbearable frenzy, an old crone came to visit the mother of the boy, Paul."
sentence

'In the week before the departure to Sacramento, when all the final scurrying about had reached a nearly unbearable frenzy, an old crone came to visit the mother of the boy, Paul.'

In [4]:
# Run the pretrained pipeline on the sentence and show the entity spans it found.
doc = nlp(sentence)
doc.ents

(the week, Sacramento, Paul)

In [5]:
# Print every entity found by the pretrained model as "<text> <label>".
for ent in doc.ents:
    print(ent.text, ent.label_)

the week DATE
Sacramento GPE
Paul PERSON


In [6]:
from spacy import displacy
# Render the baseline entities as highlighted text inside the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [7]:
# Using an NER annotation tool, the same sentence was annotated again, this time with "Arrakis" in place of "Sacramento".

In [9]:
from spacy.tokens import DocBin
from tqdm import tqdm
import json

# Load the annotations exported by the NER annotation tool. Each record is
# [text, {'entities': [[start, end, label], ...]}] with character offsets.
# The context manager closes the file handle (the bare open() never did), and
# the explicit UTF-8 encoding keeps the character offsets aligned with the
# text regardless of the platform's default encoding.
with open("C:/Users/kamalam.s/OneDrive - Kryptos/Desktop/kamalam's/nlp dev/data/ner/annotations.json", 'r', encoding='utf-8') as annotations_file:
    fict_data = json.load(annotations_file)

# Show the first record as a sanity check of the structure.
fict_data[0]

['In the week before the departure to Arrakis, when all the final scurrying about had reached a nearly unbearable frenzy, an old crone came to visit the mother of the boy, Paul.\r',
 {'entities': [[0, 11, 'DATE']]}]

In [10]:
# The base config file can be downloaded from the documentation: https://spacy.io/usage/training
# Its remaining settings can either be auto-filled with the spaCy CLI or filled in manually.

In [11]:
#Command to autofill.

In [12]:
# Reads the base config ner_config.cfg and writes the completed config to config.cfg.
!python -m spacy init fill-config ner_config.cfg config.cfg 

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [13]:
# The DocBin object from spaCy is used to store the newly annotated data.
# tqdm wraps a loop/iterable and displays a progress bar while it runs.

In [14]:
from tqdm import tqdm
# Small demonstration of tqdm over one annotated record: the progress bar is
# drawn while the loop prints the record's entity annotations.
demo_records = [[
    'Diagon Alley has a variety of things to choose and buy from, Hogsmeade is very cosy. Overall, Hogwarts must be one cheery of an experience.\r',
    {'entities': [[0, 12, 'PLACE'], [61, 70, 'PLACE'], [94, 102, 'PLACE']]},
]]
for _demo_text, demo_annotation in tqdm(demo_records):
    print(demo_annotation['entities'])

100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]

[[0, 12, 'PLACE'], [61, 70, 'PLACE'], [94, 102, 'PLACE']]





In [15]:
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(file, data):
  """Build a spaCy ``DocBin`` from character-offset entity annotations.

  Args:
    file: Open, writable text file-like object used as an error log. One line
      is written for every annotation that could not be turned into a span
      and for every document that could not be stored.
    data: Iterable of ``(text, annotations)`` pairs, where
      ``annotations['entities']`` is a list of ``[start, end, label]`` items
      and ``start``/``end`` are character offsets into ``text``.

  Returns:
    A ``DocBin`` containing one ``Doc`` per example whose entities were set
    and stored successfully.
  """
  # A blank English pipeline: only used here to tokenize via make_doc.
  nlp = spacy.blank('en')
  db = DocBin()
  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)
    ents = []
    # Character offsets already claimed by an earlier annotation of this text.
    # A set gives constant-time overlap checks instead of scanning a list.
    claimed = set()
    for start, end, label in annot['entities']:
      char_range = range(start, end)
      # Skip annotations that overlap one seen earlier.
      if not claimed.isdisjoint(char_range):
        continue
      # Offsets are claimed before the span is validated, so a rejected
      # annotation still blocks later overlapping ones (unchanged behaviour).
      claimed.update(char_range)
      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except Exception as exc:
        # Best-effort: record the failure instead of dropping it silently.
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        file.write(str([start, end]) + "    " + str(text) + "    " + repr(exc) + "\n")
        continue
      if span is None:
        # char_span produced no span for these offsets; log the annotation.
        err_data = str([start, end]) + "    " + str(text) + "\n"
        file.write(err_data)
      else:
        ents.append(span)
    try:
      doc.ents = ents
      db.add(doc)
    except Exception as exc:
      # The document is left out of the DocBin; say so in the log.
      file.write("doc skipped: " + repr(exc) + "    " + str(text) + "\n")
  return db

In [18]:
# Display the loaded annotation records before splitting them.
fict_data

[['In the week before the departure to Arrakis, when all the final scurrying about had reached a nearly unbearable frenzy, an old crone came to visit the mother of the boy, Paul.\r',
  {'entities': [[0, 11, 'DATE']]}],
 ['Diagon Alley has a variety of things to choose and buy from, Hogsmeade is very cosy. Overall, Hogwarts must be one cheery of an experience.\r',
  {'entities': [[0, 12, 'PLACE'], [61, 70, 'PLACE'], [94, 102, 'PLACE']]}],
 ["Scythia had become a labyrinth of snow and ice. I'd been monitoring the parameters of the thicket for an hour, and my vantage point in the crook of a tree branch had turned useless.\r",
  {'entities': [[0, 7, 'PLACE']]}]]

In [20]:
from sklearn.model_selection import train_test_split
# Fixed seed so the split (and everything trained from it) is reproducible
# after a kernel restart.
SPLIT_SEED = 42
train, test = train_test_split(fict_data, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), len(test))

# Directory that receives the error log and the serialized DocBin files.
trained_models_dir = "C:/Users/kamalam.s/OneDrive - Kryptos/Desktop/kamalam's/nlp dev/trained_models"

# The context manager closes the error log even if conversion or saving
# raises; UTF-8 lets any annotated text be written to the log.
with open(trained_models_dir + "/train_file.txt", 'w', encoding='utf-8') as file:
    db = get_spacy_doc(file, train)
    db.to_disk(trained_models_dir + "/train_data.spacy")
    # The held-out records are saved separately; the training command passes
    # this file as --paths.dev.
    db = get_spacy_doc(file, test)
    db.to_disk(trained_models_dir + "/test_data.spacy")

2 1


100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 429.08it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]


In [22]:
# Train from config.cfg with train_data.spacy as training data and test_data.spacy as dev data;
# the trained pipeline is written to trained_models/output.
# NOTE(review): the split above leaves a single record for the dev set, so the reported scores are not a meaningful evaluation.
!python -m spacy train "C:/Users/kamalam.s/OneDrive - Kryptos/Desktop/kamalam's/nlp dev/config.cfg"  --output "C:/Users/kamalam.s/OneDrive - Kryptos/Desktop/kamalam's/nlp dev/trained_models/output"  --paths.train "C:/Users/kamalam.s/OneDrive - Kryptos/Desktop/kamalam's/nlp dev/trained_models/train_data.spacy"  --paths.dev "C:/Users/kamalam.s/OneDrive - Kryptos/Desktop/kamalam's/nlp dev/trained_models/test_data.spacy"

[i] Saving to output directory: C:\Users\kamalam.s\OneDrive -
Kryptos\Desktop\kamalam's\nlp dev\trained_models\output
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     30.30    0.00    0.00    0.00    0.00
200     200         12.55    489.64    0.00    0.00    0.00    0.00
400     400          0.00      0.00    0.00    0.00    0.00    0.00
600     600          0.00      0.00    0.00    0.00    0.00    0.00
800     800          0.00      0.00    0.00    0.00    0.00    0.00
1000    1000          0.00      0.00    0.00    0.00    0.00    0.00
1200    1200          0.00      0.00    0.00    0.00    0.00    0.00
1400    1400          0.00      0.00    0.00    0.00    0.00    0.00
1600    1600          0.00      0.00    0.00    0.00    0.00    0.00
[+] Saved pipeline

In [42]:
# Load the "model-best" pipeline saved under the training output directory.
nlpt = spacy.load("C:/Users/kamalam.s/OneDrive - Kryptos/Desktop/kamalam's/nlp dev/trained_models/output/model-best")
# New sentence that mentions "Diagon Alley", a place name from the annotated data.
sample = "Diagon Alley, is a fictional place from Rowling's book, written two weeks earlier."

# Run the custom pipeline on the sample sentence.
doct = nlpt(sample)

In [43]:
# Print every entity predicted by the custom model as "<text> <label>".
for span in doct.ents:
  print(span.text, span.label_)

Diagon Alley PLACE


In [44]:
spacy.displacy.render(doct, style="ent", jupyter=True) # render the custom model's entities inline in the notebook

In [45]:
# Pair every token with its entity label; the label is '' for tokens that are
# not part of any entity.
ner_tagged = [(word.text, word.ent_type_) for word in doct]

# Group consecutive tokens that share a label into (entity text, label) pairs.
# Token texts are joined with single spaces.
named_entities = []
current_terms = []
current_tag = None
for term, tag in ner_tagged:
    if tag and tag == current_tag:
        # Still inside the same entity.
        current_terms.append(term)
        continue
    # The label changed (or the entity ended): close the entity in progress.
    # Closing on a label change keeps adjacent entities with different labels
    # from being merged under the last label.
    if current_terms:
        named_entities.append((' '.join(current_terms), current_tag))
    current_terms = [term] if tag else []
    current_tag = tag if tag else None

# Flush an entity that runs up to the final token; without this step an
# entity at the very end of the text would be lost.
if current_terms:
    named_entities.append((' '.join(current_terms), current_tag))

In [46]:
# Display the grouped (entity text, label) pairs.
named_entities

[('Diagon Alley', 'PLACE')]