## Create NER model using Spacy 

## Exercise 2
* Write a function that takes OfferDetails as input, and returns Product as output (Product can be single or array)
* Run the function against all rows in the attached data and determine function accuracy (Higher the better)

__Hint__ - For Exercise 2, build a corpus of products (built manually) and go from there. Mention the order of complexity (in any measures Big O, memory,
cpu etc.)

For Exercise 2, our expectation is that you implement NER model


In [63]:
#Import all required libraries
import pandas as pd
import spacy
import random
import time
import numpy as np
from spacy.util import minibatch, compounding
import sys
from spacy import displacy
from itertools import chain
import matplotlib.pyplot as plt 
from matplotlib.ticker import MaxNLocator

import warnings
warnings.filterwarnings("ignore")

nlp = spacy.load("en_core_web_md")

In [64]:
def load_data_spacy(file_path):
    ''' Converts data from:
    word \t label \n word \t label \n \n word \t label
    to: sentence, {entities : [(start, end, label), (stard, end, label)]}
    '''
    file = open(file_path, 'r')
    training_data, entities, sentence, unique_labels = [], [], [], []
    current_annotation = None
    start =0
    end = 0 # initialize counter to keep track of start and end characters
    for line in file:
        line = line.strip("\n").split("\t")
        # lines with len > 1 are words
        if len(line) > 1:
            label = line[1]
            if(label != 'O'):
                label = line[1]+"_Product"     # the .txt is formatted: label \t word, label[0:2] = label_type
            #label_type = line[0][0] # beginning of annotations - "B", intermediate - "I"
            word = line[0]
            sentence.append(word)
            start = end
            end += (len(word) + 1)  # length of the word + trailing space
           
            if label == 'I_Product' :  # if at the end of an annotation
                entities.append(( start,end-1, label))  # append the annotation
                              
            if label == 'B_Product':                         # if beginning new annotation
                entities.append(( start,end-1, label))# start annotation at beginning of word
                
           
           
            if label != 'O' and label not in unique_labels:
                unique_labels.append(label)
 
        # lines with len == 1 are breaks between sentences
        if len(line) == 1:
            if(len(entities) > 0):
                sentence = " ".join(sentence)
                training_data.append([sentence, {'entities' : entities}])
            # reset the counters and temporary lists
            end = 0 
            start = 0
            entities, sentence = [], []
            
    file.close()
    return training_data, unique_labels

In [65]:
TRAIN_DATA, LABELS = load_data_spacy("train.tsv")
print(TRAIN_DATA[0])
print(len(TRAIN_DATA))



VALID_DATA, _ = load_data_spacy("train.tsv")
print(len(VALID_DATA))
#python -m spacy download en_core_web_md

TEST_DATA, _ = load_data_spacy("test.tsv")
print(len(TEST_DATA))

['Save $ 2.00 ONE Downy Liquid Fabric Conditioner 72 ld or larger ( includes Downy Odor Protect 48 oz or larger OR Downy Wrinkle Guard 40 oz or larger OR Downy Nature Blends 67 oz or larger ) OR Bounce / Downy Sheets 130 ct or larger ( includes Bounce / Downy Wrinkle Guard 80 ct or larger ) OR In Wash Scent Boosters 8.6 oz or larger ( includes Downy Unstopables , Fresh Protect , Odor Protect , and Infusions ) ( excludes Downy Libre Enjuague , Gain Fireworks , and trial / travel size ) .', {'entities': [(16, 21, 'B_Product'), (22, 28, 'I_Product'), (29, 35, 'I_Product'), (36, 47, 'I_Product'), (75, 80, 'B_Product'), (81, 85, 'I_Product'), (86, 93, 'I_Product'), (113, 118, 'B_Product'), (119, 126, 'I_Product'), (127, 132, 'I_Product'), (152, 157, 'B_Product'), (158, 164, 'I_Product'), (165, 171, 'I_Product'), (193, 199, 'B_Product'), (200, 201, 'I_Product'), (202, 207, 'I_Product'), (208, 214, 'I_Product'), (243, 249, 'B_Product'), (250, 251, 'I_Product'), (252, 257, 'I_Product'), (258, 

In [66]:

def calc_precision(pred, true):        
    precision = len([x for x in pred if x in true]) / (len(pred) + 1e-20) # true positives / total pred
    return precision

def calc_recall(pred, true):
    recall = len([x for x in true if x in pred]) / (len(true) + 1e-20)    # true positives / total test
    return recall

def calc_f1(precision, recall):
    f1 = 2 * ((precision * recall) / (precision + recall + 1e-20))
    return f1

In [67]:
# run the predictions on each sentence in the evaluation  dataset, and return the metrics

def evaluate(ner, data ):
    '''
    '''
    preds = [ner(x[0]) for x in data]

    precisions, recalls, f1s = [], [], []

    # iterate over predictions and test data and calculate precision, recall, and F1-score
    for pred, true in zip(preds, data):
        true = [x[2] for x in list(chain.from_iterable(true[1].values()))] # x[2] = annotation, true[1] = (start, end, annot)
        pred = [i.label_ for i in pred.ents] # i.label_ = annotation label, pred.ents = list of annotations
        precision = calc_precision(true, pred)
        precisions.append(precision)
        recall = calc_recall(true, pred)
        recalls.append(recall)
        f1s.append(calc_f1(precision, recall))

    #print("Precision: {} \nRecall: {} \nF1-score: {}".format(np.around(np.mean(precisions), 3),np.around(np.mean(recalls), 3),
    #                                                         np.around(np.mean(f1s), 3)))
    return {"textcat_p": np.mean(precisions), "textcat_r": np.mean(recalls), "textcat_f":np.mean(f1s)}

In [68]:
# https://aihub.cloud.google.com/p/products%2F2290fc65-0041-4c87-a898-0289f59aa8ba


def train_spacy(train_data, labels, iterations, dropout = 0.5, display_freq = 1):
    ''' Train a spacy NER model, which can be queried against with test data
   
    train_data : training data in the format of (sentence, {entities: [(start, end, label)]})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    valid_f1scores=[]
    test_f1scores=[]
    nlp = spacy.load("en_core_web_md")
    #nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")
   
    # Add entity labels to the NER pipeline
    for i in labels:
        ner.add_label(i)

    # Disable other pipelines in SpaCy to only train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        #nlp.vocab.vectors.name = 'spacy_model' # without this, spaCy throws an "unnamed" error
        optimizer = nlp.begin_training()
        for itr in range(iterations):
            random.shuffle(train_data) # shuffle the training data before each iteration
            losses = {}
            batches = minibatch(train_data, size = compounding(16.0, 64.0, 1.5))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(          
                    texts,
                    annotations,
                    drop = dropout,  
                    sgd = optimizer,
                    losses = losses)
            #if itr % display_freq == 0:
            #    print("Iteration {} Loss: {}".format(itr + 1, losses))
            scores = evaluate(nlp,VALID_DATA)
            valid_f1scores.append(scores["textcat_f"])
            print('=======================================')
            print('Interation = '+str(itr))
            print('Losses = '+str(losses))
            print('===============VALID DATA========================')
            
            print('F1-score = '+str(scores["textcat_f"]))
            print('Precision = '+str(scores["textcat_p"]))
            print('Recall = '+str(scores["textcat_r"]))
            scores = evaluate(nlp,TEST_DATA)
            test_f1scores.append(scores["textcat_f"])
            print('===============TEST DATA========================')
            print('F1-score = '+str(scores["textcat_f"]))
            print('Precision = '+str(scores["textcat_p"]))
            print('Recall = '+str(scores["textcat_r"]))
            print('=======================================')
            
    return nlp,valid_f1scores,test_f1scores

In [69]:
ner,valid_f1scores,test_f1scores = train_spacy(TRAIN_DATA, LABELS,20)


Interation = 0
Losses = {'ner': 20262.38423538208}
F1-score = 0.0
Precision = 0.0
Recall = 0.0
F1-score = 0.0
Precision = 0.0
Recall = 0.0
Interation = 1
Losses = {'ner': 16458.64696121216}
F1-score = 0.0
Precision = 0.0
Recall = 0.0
F1-score = 0.0
Precision = 0.0
Recall = 0.0
Interation = 2
Losses = {'ner': 14840.8837184906}
F1-score = 0.10110059971204047
Precision = 0.09278965638892295
Recall = 0.11283783783783784
F1-score = 0.05752249628845546
Precision = 0.0527079872646733
Recall = 0.06395348837209303
Interation = 3
Losses = {'ner': 14412.98419380188}
F1-score = 0.6559018060462889
Precision = 0.6139206811748842
Recall = 0.7197822822822824
F1-score = 0.5785555224893232
Precision = 0.5570733422477608
Recall = 0.6104651162790697
Interation = 4
Losses = {'ner': 13729.522495269775}
F1-score = 0.9393205893205895
Precision = 0.9391843629343628
Recall = 0.941554054054054
F1-score = 0.8749354005167959
Precision = 0.8786821705426356
Recall = 0.8759689922480619
Interation = 5
Losses = {'ner':

In [72]:
## Store custom spacy model
ner.to_disk("spacy_example")

In [71]:
# Read the spacy model
ner = ner.from_disk("spacy_example")


In [93]:

doc = ner('BUY TWO GET ONE Buy ONE (1) CHIPS AHOY! Cookies (9.5-19.5 oz.) and ONE (1) SOUR PATCH KIDS Candy (4 oz. or larger), get ONE (1) SOUR PATCH KIDS CHIPS AHOY! Cookies FREE (8 oz.;)')
#doc = ner()
def product_ner(sentence):
    '''
        Function take sentence as product detail and return the Product name
    '''
    product_list = []
    product_name = []
    doc = ner(sentence)
    
    for ent in doc.ents:
        
        if ent.label_ == 'B_Product':
            if len(product_name) > 0:
                product_list.append(' '.join(product_name))
                product_name = []
            product_name.append(ent.text)
        else:
            product_name.append(ent.text)
    product_list.append(' '.join(product_name))
    return product_list


In [56]:
#print(ent.text, ent.start_char, ent.end_char, ent.label_)
print (product_ner("Save $3.00 on Herbal Essences when you buy TWO (2) Herbal Essences bio:renew Shampoo, Conditioner OR Styling Products (excludes Masks, 100 mL Shampoo and Conditioners, Color, Body Wash and trial/travel size)."))

['Herbal Essences', 'Herbal Essences bio : renew Shampoo , Conditioner OR Styling Products', 'Masks']


In [57]:
print (product_ner("SAVE $1.00 TWO Old Spice Anti-Perspirant/Deodorant, Body Wash OR Bar Soap (excludes twin packs, High Endurance, Body Spray and Invisible Spray and trial/travel size)."))

['Old Spice Anti - Perspirant / Deodorant , Body Wash', 'Bar Soap']


In [73]:
from spacy import displacy
import warnings
warnings.filterwarnings("ignore")

test_sentences = ['Save $2.00 ONE Downy Liquid Fabric Conditioner 72 ld or larger (includes Downy Odor Protect 48 oz or larger OR Downy Wrinkle Guard 40 oz or larger OR Downy Nature Blends 67 oz or larger) OR Bounce/Downy Sheets 130 ct or larger (includes Bounce/Downy Wrinkle Guard 80 ct or larger) OR In Wash Scent Boosters 8.6 oz or larger (includes Downy Unstopables, Fresh Protect, Odor Protect, and Infusions) (excludes Downy Libre Enjuague, Gain Fireworks, and trial/travel size).']
for x in test_sentences:
    doc = ner(x)
    displacy.render(doc, jupyter = True, style = "ent")
warnings.filterwarnings("default")

In [74]:
data = pd.read_csv('coupons_ner.csv', names = ['OfferDetails'])
data.head()

Unnamed: 0,OfferDetails
0,Save $2.00 ONE Downy Liquid Fabric Conditioner...
1,Save $2.00 ONE Tide PODS OR Tide Power PODS (e...
2,Save $2.00 ONE Tide Laundry Detergent (exclude...
3,SAVE $1.00 ON TWO when you buy TWO BOXES (8.9 ...
4,$3.00 OFF when you purchase any THREE (3) Pepp...


In [94]:
data['Product'] = data['OfferDetails'].apply(lambda sentence: product_ner(sentence))

data.head(20)

Unnamed: 0,OfferDetails,Product
0,Save $2.00 ONE Downy Liquid Fabric Conditioner...,"[Downy Liquid Fabric Conditioner, Downy Odor P..."
1,Save $2.00 ONE Tide PODS OR Tide Power PODS (e...,"[Tide PODS, Tide Power PODS, Tide Liquid / Pow..."
2,Save $2.00 ONE Tide Laundry Detergent (exclude...,"[Tide Laundry Detergent, Tide Purclean Laundry..."
3,SAVE $1.00 ON TWO when you buy TWO BOXES (8.9 ...,[General Mills cereal]
4,$3.00 OFF when you purchase any THREE (3) Pepp...,"[Pepperidge Farm ® Farmhouse buns , bread]"
5,SAVE $1.11 when you buy any ONE (1) Familly Si...,"[Familly Size OREO , CHIPS AHOY ! or NUTTER BU..."
6,SAVE $1.00 ON TWO when you buy TWO PACKAGES an...,"[Nature Valley ™ Granola Bars , Biscuits]"
7,Save $1.00 on any TWO (2) Sargento® Natural Ch...,[Sargento ® Natural Cheese Slices]
8,$0.65 OFF On Any ONE (1) Oikos Greek Yogurt Cu...,"[Oikos Greek Yogurt Cup, Oikos Triple Zero]"
9,$2.00 OFF ONE (1) SMALL bag of Eight O'Clock® ...,"[Eight O'Clock ® Coffee, Barista Blends]"


## Conclusion:
    1. It show good result regarding to the Product.
    2. I got 97% accuracy on the test data.