Reference tutorial: https://manivannan-ai.medium.com/how-to-train-ner-with-custom-training-data-using-spacy-188e0e508c6

In [2]:
import spacy
import random
import re
import pandas as pd
# Do not truncate long cell values when DataFrames are displayed, so the full
# query text stays readable in the outputs below.
pd.set_option('display.max_colwidth',None)


In [3]:
# Load the labelled training queries. keep_default_na=False stops pandas from
# turning blank cells (or the literal text "NA") into NaN, so every label
# column stays a plain string.
TRAIN_CSV = 'Generated_English_Queries_v2.csv'
df = pd.read_csv(TRAIN_CSV, keep_default_na=False)
df.head()

Unnamed: 0,English_Queries,Task,Category,Sub_Category,Customer
0,generate Revenue for category Meats in month Mar?,Revenue,Meats,,
1,generate Count for Sub_category Coffee in month Jun to Nov?,Count,,Coffee,
2,get Top Line for the Customer Dollar General in week Wk52?,Top Line,,,Dollar General
3,fetch Gross Sales for category Cheese in month Jan to Sep?,Gross Sales,Cheese,,
4,fetch Count for the Customer Amazon in month Nov?,Count,,,Amazon


### Convert the data to spaCy's training format

In [5]:
def return_index(sent, word):
    """Locate the first literal occurrence of ``word`` inside ``sent``.

    Parameters
    ----------
    sent : str
        Text to search in.
    word : str
        Exact substring to look for. It is matched literally, so characters
        that are special in regular expressions ('.', '+', '(', ...) have no
        special meaning.

    Returns
    -------
    tuple of (int, int) or None
        ``(start, end)`` character offsets of the first match (``end`` is
        exclusive), or ``None`` when ``word`` does not occur in ``sent``.
    """
    # Escape the needle: entity values come from data and may contain regex
    # metacharacters that would otherwise match the wrong span or raise.
    match = re.search(re.escape(word), sent)
    if match:
        return (match.start(), match.end())
    return None

In [6]:
# Convert every labelled row into spaCy's training tuple:
#     (text, {'entities': [(start_char, end_char, label), ...]})
# Each label column holds the literal entity text that appears in the query.
data = []
for index, row in df.iterrows():
    text = row['English_Queries']
    entities_lst = []

    # The column name doubles as the entity label.
    for label in ('Task', 'Category', 'Sub_Category', 'Customer'):
        value = row[label]
        # Skip columns with no entity for this row. The CSV is read with
        # keep_default_na=False, so a blank cell arrives as '' (the 'NA'
        # sentinel is still honoured); searching for '' would otherwise
        # yield a bogus zero-length (0, 0) span.
        if value in ('', 'NA'):
            continue

        span = return_index(text, value)
        if span is None:
            # Fail loudly with context instead of an opaque unpacking error.
            raise ValueError(
                "Row %r: %s value %r not found in query %r"
                % (index, label, value, text)
            )
        index_start, index_end = span
        entities_lst.append((index_start, index_end, label))

    tuple_data = (text, {'entities': entities_lst})
    data.append(tuple_data)

### Training

In [7]:
TRAIN_DATA = data

nlp = spacy.blank('en')  # create blank Language class
# Obtain the NER component: create and append it when the pipeline has none,
# otherwise reuse the existing one so `ner` is always bound below.
# NOTE(review): create_pipe / add_pipe(component) looks like the spaCy v2
# calling convention; confirm against the installed spaCy version.
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

# Register every entity label that occurs in the training annotations.
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])  # ent is (start_char, end_char, label)

In [9]:
# Train the NER component only: every other pipe is disabled for the duration
# of the `with` block.
iterations = 20
other_pipes = [name for name in nlp.pipe_names if name != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()

    for itn in range(iterations):
        print(f"Statring iteration {itn}")
        random.shuffle(TRAIN_DATA)  # shuffles the list in place before each pass
        losses = dict()  # fresh accumulator per iteration, passed to nlp.update
        for text, annotations in TRAIN_DATA:
            # One example per update call; dropout of 0.2 makes it harder to
            # memorise the data.
            nlp.update([text], [annotations], drop=0.2, sgd=optimizer, losses=losses)
        print(losses)

Statring iteration 0
{'ner': 369.3214765630943}
Statring iteration 1
{'ner': 0.0036256464860126014}
Statring iteration 2
{'ner': 8.975363776911028e-06}
Statring iteration 3
{'ner': 1.6109575917262117e-05}
Statring iteration 4
{'ner': 9.504296465345973e-06}
Statring iteration 5
{'ner': 6.283862020317652e-08}
Statring iteration 6
{'ner': 1.2475043666879122e-07}
Statring iteration 7
{'ner': 5.8372767241773284e-08}
Statring iteration 8
{'ner': 1.6873428372945908e-07}
Statring iteration 9
{'ner': 2.538000194455042e-08}
Statring iteration 10
{'ner': 169.28002225574528}
Statring iteration 11
{'ner': 2.175429562150578}
Statring iteration 12
{'ner': 7.563043569294473e-05}
Statring iteration 13
{'ner': 7.618729487097778}
Statring iteration 14
{'ner': 27.936489719120473}
Statring iteration 15
{'ner': 23.82699634333346}
Statring iteration 16
{'ner': 71.36355562442782}
Statring iteration 17
{'ner': 11.571996534374831}
Statring iteration 18
{'ner': 12.778544260329063}
Statring iteration 19
{'ner': 1

In [10]:
# Persist the trained pipeline to disk so it can be reloaded later.
MODEL_DIR = 'spacy_trained_model_v1'
nlp.to_disk(MODEL_DIR)



### Test Set Inference

In [40]:
# Load the held-out test queries. Note that this rebinds `df`, replacing the
# training frame loaded earlier. keep_default_na=False keeps blank cells as
# plain strings instead of NaN.
TEST_CSV = 'Generated_English_Queries_Test_Set.csv'
df = pd.read_csv(TEST_CSV, keep_default_na=False)
df.head(2)

Unnamed: 0,English_Queries,Task,Category,Sub_Category,Customer
0,get Revenue for Sub_category Shelf Stable in month Mar?,Revenue,,Shelf Stable,
1,fetch Top Line for Sub_category Juice in week Wk25 to Wk38?,Top Line,,Juice,


In [60]:
# Run the trained pipeline over every test query and flatten the predicted
# entities into one display string per query, formatted as
# "text:label || text:label || " (each entity is followed by ' || ').
entities = []
for query in df['English_Queries']:
    doc = nlp(query)
    entities.append(
        ''.join(ent.text + ':' + ent.label_ + ' || ' for ent in doc.ents)
    )

In [61]:
# Attach the predictions as a new column next to the gold label columns.
# This mutates `df` in place; `entities` must hold one string per row of `df`.
df['Entities'] = entities

In [62]:
# Display up to the first 50 rows to compare the gold labels with the
# predicted `Entities` column.
df.head(50)

Unnamed: 0,English_Queries,Task,Category,Sub_Category,Customer,Entities
0,get Revenue for Sub_category Shelf Stable in month Mar?,Revenue,,Shelf Stable,,Revenue:Task || Shelf Stable:Sub_Category ||
1,fetch Top Line for Sub_category Juice in week Wk25 to Wk38?,Top Line,,Juice,,Top Line:Task || Juice:Sub_Category ||
2,display Top Line for the Customer Target in year 2019?,Top Line,,,Target,Top Line:Task || Target:Customer ||
3,fetch Top Line for category Candy in quarter Q1 to Q4?,Top Line,Candy,,,Top Line:Task || Candy:Category ||
4,generate Max Shipment for the Customer Albertsons/Safeway in quarter Q1?,Max Shipment,,,Albertsons/Safeway,Max Shipment:Task || Albertsons/Safeway:Customer ||
5,get Count for the Customer Dollar General in year 2019?,Count,,,Dollar General,Count:Task || Dollar General:Customer ||
6,get Top Line for Sub_category Ketchup in year 2019?,Top Line,,Ketchup,,Top Line:Task || Ketchup:Sub_Category ||
7,fetch Net sales for category Cheese in week Wk24 to Wk44?,Net sales,Cheese,,,Net sales:Task || Cheese:Category ||
8,generate Revenue for the Customer Costco in month Aug?,Revenue,,,Costco,Revenue:Task || Costco:Customer ||
9,get Count for Sub_category Juice in quarter Q2 to Q4?,Count,,Juice,,Count:Task || Juice:Sub_Category ||
