Reference tutorial: https://manivannan-ai.medium.com/how-to-train-ner-with-custom-training-data-using-spacy-188e0e508c6

In [1]:
import spacy
import random
import re
import pandas as pd
# Do not truncate long cell contents (the full query strings) when DataFrames are displayed.
pd.set_option('display.max_colwidth',None)


In [2]:
# Load the generated training queries. keep_default_na=False stops pandas from
# turning cells such as 'NA' into NaN, so they stay plain strings.
train_csv_path = 'Generated_English_Queries_v3_without_keyword.csv'
df = pd.read_csv(train_csv_path, keep_default_na=False)
df.head()

Unnamed: 0,English_Queries,Task,Category,Sub_Category,Customer
0,display Max Shipment Cheese in year 2019?,Max Shipment,Cheese,,
1,generate Gross Sales Candy in year 2019?,Gross Sales,Candy,,
2,generate Revenue Gelatin in quarter Q4?,Revenue,,Gelatin,
3,get Max Shipment Aldi in quarter Q2 to Q4?,Max Shipment,,,Aldi
4,display Count Bacon in year 2019?,Count,,Bacon,


### Convert the data to spaCy's training format

In [3]:
def return_index(sent, word):
    """Locate the first literal occurrence of ``word`` inside ``sent``.

    Args:
        sent: Text to search in.
        word: Substring to look for. It is matched literally, so regex
            metacharacters such as ``+``, ``(`` or ``.`` have no special
            meaning.

    Returns:
        A ``(start, end)`` tuple of character offsets (end exclusive) for the
        first occurrence, or ``None`` when ``word`` does not occur in ``sent``.
    """
    # Entity values are plain text, not patterns: escape them so names like
    # "C++" or "(USD)" neither raise re.error nor match the wrong span.
    match = re.search(re.escape(word), sent)
    if match is None:
        return None
    return (match.start(), match.end())

In [4]:
# Columns that hold entity values; the column name doubles as the entity label.
ENTITY_COLUMNS = ('Task', 'Category', 'Sub_Category', 'Customer')

# Build (text, {'entities': [(start, end, label), ...]}) tuples, one per query.
data = []
for row_idx, row in df.iterrows():
    text = row['English_Queries']
    entities_lst = []

    for label in ENTITY_COLUMNS:
        value = row[label]
        # 'NA' marks "no entity of this type". An empty cell is skipped too,
        # because it could only produce a zero-length (0, 0) span.
        if value in ('NA', ''):
            continue

        span = return_index(text, value)
        if span is None:
            # Fail with a readable message instead of an unpacking TypeError.
            raise ValueError(
                "Row {}: {} value {!r} not found in query {!r}".format(
                    row_idx, label, value, text
                )
            )
        index_start, index_end = span
        entities_lst.append((index_start, index_end, label))

    data.append((text, {'entities': entities_lst}))

In [5]:
# Preview a few training examples instead of dumping the whole list.
data[:10]

[('display Max Shipment  Cheese in year 2019?',
  {'entities': [(8, 20, 'Task'), (22, 28, 'Category')]}),
 ('generate Gross Sales  Candy in year 2019?',
  {'entities': [(9, 20, 'Task'), (22, 27, 'Category')]}),
 ('generate Revenue  Gelatin in quarter Q4?',
  {'entities': [(9, 16, 'Task'), (18, 25, 'Sub_Category')]}),
 ('get Max Shipment  Aldi in quarter Q2 to Q4?',
  {'entities': [(4, 16, 'Task'), (18, 22, 'Customer')]}),
 ('display Count  Bacon in year 2019?',
  {'entities': [(8, 13, 'Task'), (15, 20, 'Sub_Category')]}),
 ('fetch Net sales  Candy in year 2019?',
  {'entities': [(6, 15, 'Task'), (17, 22, 'Category')]}),
 ('generate Revenue  Creamers in quarter Q3?',
  {'entities': [(9, 16, 'Task'), (18, 26, 'Sub_Category')]}),
 ('get Net sales  Bars/Snacks in quarter Q2 to Q4?',
  {'entities': [(4, 13, 'Task'), (15, 26, 'Sub_Category')]}),
 ('fetch Count  Creamers in week Wk31?',
  {'entities': [(6, 11, 'Task'), (13, 21, 'Sub_Category')]}),
 ('generate Net sales  Meats in quarter Q1 to

### Training

In [6]:
# Train on a shallow copy so the in-place shuffling done during training
# does not reorder `data`.
TRAIN_DATA = list(data)

nlp = spacy.blank('en')  # create blank Language class
# Create the built-in NER component and add it to the pipeline.
# nlp.create_pipe works for built-ins that are registered with spaCy.
# NOTE: create_pipe + add_pipe(component_object) is the spaCy v2 calling convention.
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    # Keep `ner` bound even when the pipeline already contains the component.
    ner = nlp.get_pipe('ner')

# Register every entity label that occurs in the annotations.
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities', []):
        ner.add_label(ent[2])  # ent is (start, end, label)

In [7]:
# Number of full passes over the training data.
iterations = 20

# Every pipeline component except NER is switched off while training.
other_pipes = [name for name in nlp.pipe_names if name != 'ner']

with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()

    for epoch in range(iterations):
        print("Statring iteration " + str(epoch))
        random.shuffle(TRAIN_DATA)  # new example order on every pass
        losses = {}  # filled in by nlp.update during this pass
        for text, annotations in TRAIN_DATA:
            # One example per update; drop=0.2 is the dropout rate and
            # `optimizer` is the callable that applies the weight updates.
            nlp.update([text], [annotations], sgd=optimizer, drop=0.2, losses=losses)
        print(losses)

  proc.begin_training(
  proc.begin_training(


Statring iteration 0
{'ner': 539.078071604505}
Statring iteration 1
{'ner': 8.093159410990522}
Statring iteration 2
{'ner': 0.012387057179723588}
Statring iteration 3
{'ner': 108.61125278398958}
Statring iteration 4
{'ner': 0.00028191408839848}
Statring iteration 5
{'ner': 0.0011553522642202118}
Statring iteration 6
{'ner': 67.61972112161425}
Statring iteration 7
{'ner': 5.9971986536485735}
Statring iteration 8
{'ner': 1.31534110109281e-06}
Statring iteration 9
{'ner': 63.742733820497776}
Statring iteration 10
{'ner': 4.947617364755552}
Statring iteration 11
{'ner': 2.7266675604888026e-05}
Statring iteration 12
{'ner': 8.341549517048614e-05}
Statring iteration 13
{'ner': 114.64027035482913}
Statring iteration 14
{'ner': 13.214697720386981}
Statring iteration 15
{'ner': 29.04505441085204}
Statring iteration 16
{'ner': 0.0025630773324564282}
Statring iteration 17
{'ner': 7.463689180085671e-09}
Statring iteration 18
{'ner': 2.9508983356218444e-05}
Statring iteration 19
{'ner': 37.95477442

In [9]:
# Serialize the trained pipeline to a directory on disk.
model_dir = 'spacy_trained_model_v2'
nlp.to_disk(model_dir)



### Test Set Inference

In [16]:
# Load the held-out test queries; as with the training file, keep 'NA' cells as strings.
# Note: this rebinds `df`, replacing the training frame loaded earlier.
test_csv_path = 'Generated_English_Queries_v3_without_keyword_Test_Set.csv'
df = pd.read_csv(test_csv_path, keep_default_na=False)
df.head(2)

Unnamed: 0,English_Queries,Task,Category,Sub_Category,Customer
0,generate Max Shipment Candy in quarter Q2?,Max Shipment,Candy,,
1,generate Top Line Sauces in quarter Q1 to Q4?,Top Line,Sauces,,


In [17]:
# Run the trained pipeline over every test query and render the predicted
# entities as "text:label || text:label || " (each entity is followed by the
# separator, so non-empty results end with ' || ').
# nlp.pipe processes the texts as a stream and yields one Doc per text in input
# order, so `entities` lines up with the rows of df.
entities = [
    ''.join(ent.text + ':' + ent.label_ + ' || ' for ent in doc.ents)
    for doc in nlp.pipe(df['English_Queries'].tolist())
]

In [18]:
# Attach the formatted predictions as a new column; `entities` has one item per row of df, in row order.
df['Entities'] = entities

In [19]:
# Show up to the first 50 test queries with the expected label columns next to the predicted entities.
df.head(50)

Unnamed: 0,English_Queries,Task,Category,Sub_Category,Customer,Entities
0,generate Max Shipment Candy in quarter Q2?,Max Shipment,Candy,,,Max Shipment:Task || Candy:Category ||
1,generate Top Line Sauces in quarter Q1 to Q4?,Top Line,Sauces,,,Top Line:Task || Sauces:Category ||
2,display Net sales Tea in month Nov?,Net sales,,Tea,,Net sales:Task || Tea:Sub_Category ||
3,get Top Line Ketchup in year 2019?,Top Line,,Ketchup,,Top Line:Task || Ketchup:Sub_Category ||
4,get Revenue Hot Dogs in month Aug?,Revenue,,Hot Dogs,,Revenue:Task || Hot Dogs:Sub_Category ||
5,fetch Bottom Line Candy in month Jun to Nov?,Bottom Line,Candy,,,Bottom Line:Task || Candy:Category ||
6,fetch Revenue Albertsons/Safeway in month Oct?,Revenue,,,Albertsons/Safeway,Revenue:Task || Albertsons/Safeway:Customer ||
7,fetch Top Line UNIFI in year 2019?,Top Line,,,UNIFI,Top Line:Task || UNIFI:Customer ||
8,fetch Revenue Meats in year 2019?,Revenue,Meats,,,Revenue:Task || Meats:Category ||
9,fetch Revenue Cheese in week Wk16 to Wk43?,Revenue,Cheese,,,Revenue:Task || Cheese:Category ||
