## Clean NER data

In [None]:
def removeDuplicates(data, colName, keep="first"):
    """Return ``data`` ordered by ``colName`` with repeated values removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    colName : str
        Column whose values must be unique in the result.
    keep : {"first", "last", False}, default "first"
        Which occurrence of a repeated value survives. ``False`` discards
        every row whose value occurs more than once.

    Returns
    -------
    pandas.DataFrame
        The surviving rows sorted by ``colName``; original index labels are
        retained.
    """
    # De-duplicate before sorting so that "first"/"last" refer to the caller's
    # row order rather than to whatever order the sort leaves ties in.
    deduped = data.drop_duplicates(subset=colName, keep=keep)

    # sort_values without inplace returns a new frame, so the caller's
    # DataFrame keeps its original row order.
    return deduped.sort_values(colName)

In [54]:
import pandas as pd 
# Load the raw genre list and normalise the names *before* de-duplicating, so
# entries that differ only by case or by leading/trailing whitespace are
# treated as the same name.
data = pd.read_csv("genre.csv")
data['name'] = data['name'].str.strip().str.lower()

# Write the de-duplicated list without the pandas index column.
df = removeDuplicates(data, "name")
df.to_csv("cleanData/Genre.csv", index=False)

Unnamed: 0,name
count,63
unique,63
top,video
freq,1


## Convert NER data to spaCy format

The training data is a list of `(text, annotations)` tuples, where each entity is given as `(start, end, label)`:
 [('what is the price of polo?', {'entities': [(21, 25, 'PrdName')]}), ...]

In [91]:
import spacy
import random
from spacy import displacy 

In [92]:
# Read each cleaned vocabulary and tag every row with the entity label it
# will be trained under (stored in the 'lable' column).
genra_data = pd.read_csv("cleanData/Genre.csv").assign(lable="genra")
style_data = pd.read_csv("cleanData/Style.csv").assign(lable="style")
medium_data = pd.read_csv("cleanData/Medium.csv").assign(lable="medium")

# Stack the three vocabularies under a fresh 0..n-1 index, then trim
# surrounding whitespace from the names.
label_frames = [genra_data, style_data, medium_data]
newLb = pd.concat(label_frames, ignore_index=True)
newLb['name'] = newLb['name'].str.strip()

In [93]:
# Every name is used as a complete training text, so its single entity span
# runs from offset 0 to the end of the string.
TRAIN_DATA = [
    (name, {'entities': [(0, len(name), label)]})
    for name, label in zip(newLb["name"], newLb["lable"])
]
TRAIN_DATA

[('abstract', {'entities': [(0, 8, 'genra')]}),
 ('advertisement', {'entities': [(0, 13, 'genra')]}),
 ('allegorical painting', {'entities': [(0, 20, 'genra')]}),
 ('animal painting', {'entities': [(0, 15, 'genra')]}),
 ('animation', {'entities': [(0, 9, 'genra')]}),
 ('architecture', {'entities': [(0, 12, 'genra')]}),
 ("artist's book", {'entities': [(0, 13, 'genra')]}),
 ('battle painting', {'entities': [(0, 15, 'genra')]}),
 ('bijinga', {'entities': [(0, 7, 'genra')]}),
 ('bird-and-flower painting', {'entities': [(0, 24, 'genra')]}),
 ('calligraphy', {'entities': [(0, 11, 'genra')]}),
 ('capriccio', {'entities': [(0, 9, 'genra')]}),
 ('caricature', {'entities': [(0, 10, 'genra')]}),
 ('cityscape', {'entities': [(0, 9, 'genra')]}),
 ('cloudscape', {'entities': [(0, 10, 'genra')]}),
 ('design', {'entities': [(0, 6, 'genra')]}),
 ('figurative', {'entities': [(0, 10, 'genra')]}),
 ('flower painting', {'entities': [(0, 15, 'genra')]}),
 ('genre painting', {'entities': [(0, 14, 'genra')]}

## Train the spaCy model

In [94]:
def train_spacy(data, iterations):
    """Update the NER component of ``en_core_web_sm`` with ``data``.

    Parameters
    ----------
    data : iterable of (text, annotations) pairs
        ``annotations`` is a dict whose ``'entities'`` entry is a list of
        ``(start, end, label)`` tuples. The iterable is copied, so the
        caller's object is not reordered by the per-iteration shuffle.
    iterations : int
        Number of passes over the examples.

    Returns
    -------
    The loaded pipeline object after all updates have been applied.
    """
    # Work on a private copy: random.shuffle() reorders its argument in
    # place, and training must not reorder the caller's list.
    examples = list(data)

    # Loads the pretrained English pipeline (this is not a blank Language).
    nlp = spacy.load('en_core_web_sm')

    # Reuse the pipeline's NER component when present; otherwise create one
    # and append it to the end of the pipeline.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every label that appears in the annotations. A missing or
    # empty 'entities' entry simply contributes no labels.
    for _, annotations in examples:
        for ent in annotations.get('entities') or []:
            ner.add_label(ent[2])

    # Disable every other pipe so only NER is updated.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        # NOTE(review): begin_training() is called on a pretrained pipeline;
        # spaCy 2.1+ documents resume_training() for updating an existing
        # model. Confirm which one is intended for the installed version.
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update(
                    [text],  # batch of texts (batch size 1)
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp
    


In [95]:
prdnlp = train_spacy(TRAIN_DATA, 20)

# Save our trained model.
# When nothing is entered, fall back to "m2" (the directory the test cell
# loads from) instead of handing an empty path to to_disk().
modelfile = input("Enter your Model Name: ").strip() or "m2"
prdnlp.to_disk(modelfile)

Statring iteration 0
{'ner': 568.0723570561408}
Statring iteration 1
{'ner': 464.95748757173163}
Statring iteration 2
{'ner': 434.45754296321775}
Statring iteration 3
{'ner': 384.0656252154281}
Statring iteration 4
{'ner': 353.78662708248163}
Statring iteration 5
{'ner': 314.8090607273916}
Statring iteration 6
{'ner': 324.86414690081506}
Statring iteration 7
{'ner': 313.91948017633007}
Statring iteration 8
{'ner': 327.011448704648}
Statring iteration 9
{'ner': 298.8379854843043}
Statring iteration 10
{'ner': 301.0056355236438}
Statring iteration 11
{'ner': 318.87600132926025}
Statring iteration 12
{'ner': 328.3235500817014}
Statring iteration 13
{'ner': 317.2902033575198}
Statring iteration 14
{'ner': 310.44941263615294}
Statring iteration 15
{'ner': 305.31569095913767}
Statring iteration 16
{'ner': 323.56686644447353}
Statring iteration 17
{'ner': 286.5402217668015}
Statring iteration 18
{'ner': 261.300319013712}
Statring iteration 19
{'ner': 342.0833404644066}
Enter your Model Name: 

## Test the new Model

In [96]:
# Load the saved pipeline from the hard-coded "m2" directory.
nlp = spacy.load("m2")

# Sample sentence, written as adjacent literals that join into one string.
text = (
    "But Google is starting from behind. "
    "The company made a late push into hardware, and Apple’s Siri, "
    "available on iPhones, and Amazon’s Alexa software, which runs on "
    "its Echo and Dot devices, have clear leads in consumer adoption."
)

# Run the pipeline and render the recognised entities inline.
doc = nlp(text)
displacy.render(doc, style="ent", jupyter=True)

In [97]:
import requests
import json

url="http://mec402.boisestate.edu/cgi-bin/assetSources/museums.py?q=lady&name=met&p=5&ps=1&type=1"

# Bound the wait and surface HTTP errors explicitly, so a slow or failing
# endpoint does not hang the notebook or produce a confusing JSON error.
response = requests.get(url, timeout=30)
response.raise_for_status()
test_data = response.json()

# First returned record; the lookup raises immediately if the payload has no
# "data" entries.
dd = test_data["data"][0]

# Feed the whole JSON payload, lower-cased, through the pipeline loaded above
# and render the recognised entities inline.
text = json.dumps(test_data)

doc = nlp(text.lower())
displacy.render(doc, style="ent", jupyter=True)