In [50]:
import spacy

In [51]:
nlp = spacy.load("en_core_web_sm") 

In [52]:
sentence = "In the week before the departure to Sacramento, when all the final scurrying about had reached a nearly unbearable frenzy, an old crone came to visit the mother of the boy, Paul."
sentence #Sacramento since Arrakis (Fictional Place) was not captured as place

'In the week before the departure to Sacramento, when all the final scurrying about had reached a nearly unbearable frenzy, an old crone came to visit the mother of the boy, Paul.'

In [53]:
doc = nlp(sentence)
doc.ents

(the week, Sacramento, Paul)

In [54]:
for entity in doc.ents:
    print(entity.text, entity.label_)

the week DATE
Sacramento GPE
Paul PERSON


In [55]:
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

In [56]:
#Using NER annotation tool, prepared the same sentence, but with "Arrakis" instead of Sacramento

In [57]:
from spacy.tokens import DocBin
from tqdm import tqdm
import json

fict_data = json.load(open("C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/data/ner/annotations.json",'r'))
len(fict_data)
fict_data[0]

['In the week before the departure to Arrakis, when all the final scurrying about had reached a nearly unbearable frenzy, an old crone came to visit the mother of the boy, Paul.\r',
 {'entities': [[0, 11, 'DATE']]}]

In [58]:
#Base config file can be downloaded from the documentation: https://spacy.io/usage/training
#Either auto-fill can be done on the same or can be filled manually. 

In [74]:
#Command to autofill.

In [108]:
!python -m spacy init fill-config ner_config.cfg config.cfg 

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [60]:
#Using DocBin object from spacy to load our newly annotated data
#tqdm aids in creating a progress bar to loops/iterables.

In [61]:
from tqdm import tqdm
for text, ents in tqdm([['Diagon Alley has a variety of things to choose and buy from, Hogsmeade is very cosy. Overall, Hogwarts must be one cheery of an experience.\r',
  {'entities': [[0, 12, 'PLACE'], [61, 70, 'PLACE'], [94, 102, 'PLACE']]}]]):
    n = ents['entities']
    print(n)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]

[[0, 12, 'PLACE'], [61, 70, 'PLACE'], [94, 102, 'PLACE']]





In [62]:
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(file, data):
  # Create a blank spaCy pipeline
  nlp = spacy.blank('en')
  db = DocBin()
  # Iterate through the data
  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)
    annot = annot['entities']
    ents = []
    entity_indices = []
    # Extract entities from the annotations
    for start, end, label in annot:
      skip_entity = False
      for idx in range(start, end):
        if idx in entity_indices:
          skip_entity = True
          break
      if skip_entity:
        continue
      entity_indices = entity_indices + list(range(start, end))
      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except:
        continue
      if span is None:
        # Log errors for annotations that couldn't be processed
        err_data = str([start, end]) + "    " + str(text) + "\n"
        file.write(err_data)
      else:
        ents.append(span)
    try:
      doc.ents = ents
      db.add(doc)
    except:
      pass
  return db

In [63]:
train

[['Diagon Alley has a variety of things to choose and buy from, Hogsmeade is very cosy. Overall, Hogwarts must be one cheery of an experience.\r',
  {'entities': [[0, 12, 'PLACE'], [61, 70, 'PLACE'], [94, 102, 'PLACE']]}],
 ["Scythia had become a labyrinth of snow and ice. I'd been monitoring the parameters of the thicket for an hour, and my vantage point in the crook of a tree branch had turned useless.\r",
  {'entities': [[0, 7, 'PLACE']]}]]

In [66]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(fict_data, test_size=0.2)
print(len(train), len(test))
file = open("C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/trained_models/train_file.txt",'w')
db = get_spacy_doc(file, train)
db.to_disk("C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/trained_models/train_data.spacy")
db = get_spacy_doc(file, test)
db.to_disk("C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/trained_models/test_data.spacy")
file.close()

2 1


100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 196.99it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 175.91it/s]


In [73]:
!python -m spacy train "C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/config.cfg"  --output "C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/trained_models/output"  --paths.train "C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/trained_models/train_data.spacy"  --paths.dev "C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/trained_models/test_data.spacy"

[i] Saving to output directory: C:\Users\kamalam.s\Desktop\kamalam's\nlp
dev\trained_models\output
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     33.40    0.00    0.00    0.00    0.00
200     200          0.48    417.62    0.00    0.00    0.00    0.00
400     400          0.00      0.00    0.00    0.00    0.00    0.00
600     600          0.00      0.00    0.00    0.00    0.00    0.00
800     800          0.00      0.00    0.00    0.00    0.00    0.00
1000    1000          0.00      0.00    0.00    0.00    0.00    0.00
1200    1200          0.00      0.00    0.00    0.00    0.00    0.00
1400    1400          0.00      0.00    0.00    0.00    0.00    0.00
1600    1600          0.00      0.00    0.00    0.00    0.00    0.00
[+] Saved pipeline to output director

In [91]:
nlpt = spacy.load("C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/trained_models/output/model-best")
sample = "Freya is from Azkaban."

doct = nlpt(sample)

In [92]:
for ent in doct.ents:
  print(ent.text, ent.label_)

Freya PLACE


In [97]:
#Trying to perform a better custom ner

In [96]:
data = json.load(open("C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/custom_trial/custom_trial.json",'r'))
len(data)
data[0]

['2021-07-08 13:45:57 INFO WMQJCAResourceAdapter: 0] =>>>GenericMessageDrivenBean - start() <Envelope><Header><CommonInfo><Source>OTP</Source><Service>SMSSrv</Service></CommonInfo>\r\n</Header><Body><SMSSrv><Mobile><CountryCode>61</CountryCode><Num>0420742903</Num>\r\n</Mobile><Source>IBNK</Source><RefNum>6266342</RefNum><RefNum2>1</RefNum2>\r\n<Bank>STG</Bank><GCISCustNum>11181855</GCISCustNum><SMSMsg>Your Secure Code is 494244 to add\r\nPayee with BB:062-013 Account:424242454. Never share this code.</SMSMsg></SMSSrv></Body></Envelope>\r\n(au.com.stgeorge.sms.integration.impl.listener.GenericMessageDrivenBean)\r\n',
 {'entities': [[418, 424, 'SOURCECODE']]}]

In [98]:
!python -m spacy init fill-config ner_config.cfg "C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/custom_trial/config.cfg"

[+] Auto-filled config with all values
[+] Saved config
C:\Users\kamalam.s\Desktop\kamalam's\nlp dev\custom_trial\config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [99]:
train, test = train_test_split(fict_data, test_size=0.2)
print(len(train), len(test))
file = open("C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/custom_trial/train_file.txt",'w')
db = get_spacy_doc(file, train)
db.to_disk("C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/custom_trial/train_data.spacy")
db = get_spacy_doc(file, test)
db.to_disk("C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/custom_trial/test_data.spacy")
file.close()

2 1


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 82.89it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 302.38it/s]


In [102]:
!python -m spacy train "C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/custom_trial/config.cfg"  --output "C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/custom_trial/output"  --paths.train "C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/custom_trial/train_data.spacy"  --paths.dev "C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/custom_trial/test_data.spacy"

[+] Created output directory: C:\Users\kamalam.s\Desktop\kamalam's\nlp
dev\custom_trial\output
[i] Saving to output directory: C:\Users\kamalam.s\Desktop\kamalam's\nlp
dev\custom_trial\output
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     30.30    0.00    0.00    0.00    0.00
200     200          2.00    432.13    0.00    0.00    0.00    0.00
400     400          0.00      0.00    0.00    0.00    0.00    0.00
600     600          0.00      0.00    0.00    0.00    0.00    0.00
800     800          0.00      0.00    0.00    0.00    0.00    0.00
1000    1000          0.00      0.00    0.00    0.00    0.00    0.00
1200    1200          0.00      0.00    0.00    0.00    0.00    0.00
1400    1400          0.00      0.00    0.00    0.00    0.00    0.00
1600    1600 

In [103]:
nlpt = spacy.load("C:/Users/kamalam.s/Desktop/kamalam's/nlp dev/custom_trial/output/model-best")
sample = "\r\n2021-07-02 14:57:54:456 INFO [WMQJCAResourceAdapter: 4] =>srvName :AlertTriggerSrv request :<Envelope>\r\n<Header><Commoninfo><Source>ODS</Source><SourceBranch><Num>0</Num></SourceBranch>\r\n<Service>AlertTriggerSrv</Service></CommonInfo></Header><Body><AlertTriggerSr>\r\n<Applid>CRA</Applid><AcctNum><AcctKey>4601841000280711</AcctKey></AcctNum>\r\n<AvailBal>5599.20</AvailBal><CurrBal>9348.30</CurrBal><ODLim>0.00</ODLim>\r\n<FloatAmt>0.00</FloatAmt><ShortName>Priyesh Dave</ShortName><TransType>AU</TransType><TranEffDate>\r\n<Day>02</Day><Month>07</Month><Year>2021</Year></TranEffDate><TranEffTime><Hour>14</Hour>\r\n<Min>57</Min><Sec>54</Sec></TranEffTime><TranAmt>52.50</TranAmt><TranDesc1>TRML= W1364081\r\n20210702</TranDesc1><TranDesc2>WOOLWORTHS/BEECROFTRDBEECROFTAU</TranDesc2>\r\n<SchedulePymtind>N</SchedulePymtInd><Brand></Brand></AlertTriggerSrv></Body></Envelope>response:<Envelope><Body><AlertTriggerSrvResp><Code><Code>000</Code><Desc>The request was\r\nsuccessfully performed</Desc></Code></AlertTriggerSrvResp></Body></Envelope>\r\n(au.com.stgeorge.sms.integration.IntegrationManager)\r\n"

doct = nlpt(sample)

In [104]:
for ent in doct.ents:
  print(ent.text, ent.label_)

2021- PLACE
07- PLACE
02 14:57:54:456 PLACE
INFO [ PLACE
WMQJCAResourceAdapter: PLACE
4] PLACE
=> PLACE
srvName : PLACE
AlertTriggerSrv request PLACE
: PLACE
Header><Commoninfo><Source> PLACE
ODS</Source><SourceBranch><Num>0</Num></SourceBranch> PLACE
>AlertTriggerSrv</Service></CommonInfo></Header><Body><AlertTriggerSr PLACE
>
< DATE
Applid> PLACE
CRA</Applid><AcctNum><AcctKey>4601841000280711</AcctKey></AcctNum> PLACE
AvailBal>5599.20</AvailBal><CurrBal>9348.30</CurrBal><ODLim>0.00</ODLim PLACE
>Priyesh PLACE
Dave</ShortName><TransType> PLACE
AU</TransType><TranEffDate> PLACE
Day>02</Day><Month>07</Month><Year>2021</Year></TranEffDate><TranEffTime><Hour>14</Hour> PLACE
Min>57</Min><Sec>54</Sec></TranEffTime><TranAmt>52.50</TranAmt><TranDesc1> PLACE
TRML= W1364081
 DATE
>WOOLWORTHS PLACE
/BEECROFTRDBEECROFTAU</TranDesc2 PLACE
>
 PLACE
SchedulePymtind> PLACE
N</SchedulePymtInd><Brand></Brand></AlertTriggerSrv></Body></Envelope> PLACE
response:<Envelope><Body><AlertTriggerSrvResp><Code>

In [105]:
spacy.displacy.render(doct, style="ent", jupyter=True) # display in Jupyter

In [106]:
ner_tagged = [(word.text, word.ent_type_) for word in doct] 

named_entities = []
temp_entity_name = ''
temp_named_entity = None
for term, tag in ner_tagged:
    if tag:
        temp_entity_name = ' '.join([temp_entity_name, term]).strip()
        temp_named_entity = (temp_entity_name, tag)
    else:
        if temp_named_entity:
            named_entities.append(temp_named_entity)
            temp_entity_name = ''
            temp_named_entity = None

In [107]:
named_entities

[('2021 - 07 - 02 14:57:54:456 INFO [ WMQJCAResourceAdapter : 4 ] = > srvName : AlertTriggerSrv request :',
  'PLACE'),
 ('Header><Commoninfo><Source > ODS</Source><SourceBranch><Num>0</Num></SourceBranch >',
  'PLACE'),
 ('> AlertTriggerSrv</Service></CommonInfo></Header><Body><AlertTriggerSr > < Applid > CRA</Applid><AcctNum><AcctKey>4601841000280711</AcctKey></AcctNum >',
  'PLACE'),
 ('AvailBal>5599.20</AvailBal><CurrBal>9348.30</CurrBal><ODLim>0.00</ODLim',
  'PLACE'),
 ('> Priyesh Dave</ShortName><TransType > AU</TransType><TranEffDate >',
  'PLACE'),
 ('Day>02</Day><Month>07</Month><Year>2021</Year></TranEffDate><TranEffTime><Hour>14</Hour >',
  'PLACE'),
 ('Min>57</Min><Sec>54</Sec></TranEffTime><TranAmt>52.50</TranAmt><TranDesc1 > TRML= W1364081',
  'DATE'),
 ('> WOOLWORTHS / BEECROFTRDBEECROFTAU</TranDesc2 >', 'PLACE'),
 ('SchedulePymtind > N</SchedulePymtInd><Brand></Brand></AlertTriggerSrv></Body></Envelope > response:<Envelope><Body><AlertTriggerSrvResp><Code><Code>000</Co