# NLP Using spaCy

In [2]:
# ! pip install spacy
# ! pip install bs4
# ! pip install requests
# !python -m spacy download en_core_web_sm

In [6]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from bs4 import BeautifulSoup
import requests
import re

In [7]:
# Load spaCy's small English pipeline (en_core_web_sm) through the imported
# model package; `nlp` is the callable used below to turn raw text into a Doc.
nlp = en_core_web_sm.load()

In [25]:
# Sample order-confirmation utterance, run through the pipeline so the cells
# below can inspect its entities and sentences.
trading = (
    "I will now confirm your order you want to buy "
    "a hundred shares of Apple stock at $350 each"
)
trade = nlp(trading)

In [26]:
# Print every entity span found in the utterance followed by its label.
for span in trade.ents:
    print(span.text, span.label_)

a hundred CARDINAL
Apple ORG
350 MONEY


In [27]:
# Collect the label of each entity, then tally how often each label occurs.
labels = []
for span in trade.ents:
    labels.append(span.label_)
Counter(labels)

Counter({'CARDINAL': 1, 'ORG': 1, 'MONEY': 1})

In [28]:
# Collect the surface text of each entity and show the three most frequent.
items = []
for span in trade.ents:
    items.append(span.text)
Counter(items).most_common(3)

[('a hundred', 1), ('Apple', 1), ('350', 1)]

In [29]:
# Materialise the sentence spans of the parsed utterance and preview up to three.
sentences = list(trade.sents)
print(sentences[:3])

[I will now confirm your order you want to buy a hundred shares of Apple stock at $350 each]


In [30]:
# Visualizers
# Highlight the named entities inline. Render the already-parsed Doc directly:
# str(sentences) is the repr of a Python list, so re-parsing it fed the list
# punctuation (brackets, and commas between sentences) to the model as extra
# tokens and ran the pipeline a second time for no benefit.
displacy.render(trade, style='ent', jupyter=True)

In [31]:
# Draw the dependency parse of the utterance. The parsed Doc is rendered as-is
# instead of re-parsing str(sentences), whose list punctuation would show up as
# spurious tokens in the tree. 'distance' sets the spacing between words.
displacy.render(trade, style='dep', jupyter=True, options={'distance': 120})

### Named Entity Recogniser (NER)

In [16]:
# !python -m spacy download en

In [17]:
import numpy as np
import pandas as pd

# Load the English pipeline through the 'en' shortcut and parse a sample paragraph.
nlp = spacy.load('en')
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# One row per token. Every label starts out as 'O' (outside any entity) because
# most tokens are non-entities; the real tags are meant to be filled in later
# according to the annotation scheme.
words = [token.text for token in doc]
labels = ['O'] * len(words)

# The '.biluo' extension only signals the tag encoding; the file content is plain CSV.
df = pd.DataFrame({'word': words, 'label': labels})
df.to_csv('ner-token-per-line.biluo', index=False)

In [18]:
dpath = 'ner-token-per-line.biluo'

# Read both columns strictly as text. With pandas' defaults, a token such as
# "NA", "null" or "nan" would be turned into a float NaN (breaking the join
# below), and an all-numeric column would be type-inferred.
df = pd.read_csv(dpath, sep=',', dtype=str, keep_default_na=False)
words  = df.word.values
ents = df.label.values
text = ' '.join(words)  # space-joined text, reused by the later prediction cell

from spacy.gold import GoldParse
from spacy.tokens import Doc

# Build the Doc straight from the stored tokens so there is exactly one token
# per label. Re-tokenising the joined text is not guaranteed to reproduce the
# same token boundaries, which would misalign the per-token entity tags.
doc = Doc(nlp.vocab, words=words.tolist())
g = GoldParse(doc, entities=ents.tolist())

# Single-example training set: parallel lists of Docs and their gold annotations.
X = [doc]
Y = [g]

In [39]:
add_ents = ['DATED'] # The new entity label(s) to add to the recogniser

# Pipelines in the core pretrained model are tagger, parser, ner. Reuse the
# existing NER component when there is one; create it only for a pipeline that
# lacks it (e.g. one built with `spacy.blank('en')`).
ner_is_new = "ner" not in nlp.pipe_names
if ner_is_new:
    ner = nlp.create_pipe("ner")  # https://spacy.io/api/annotation
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe("ner")

# Snapshot the transition names before adding labels so the difference can be reported.
prev_ents = list(ner.move_names)
print('[Existing Entities] = ', prev_ents)
for ent in add_ents:
    ner.add_label(ent)

new_ents = list(ner.move_names)
print('\n[All Entities] = ', new_ents)
# Order-preserving difference, so the printed list is deterministic between runs.
known_ents = set(prev_ents)
print('\n\n[New Entities] = ', [name for name in new_ents if name not in known_ents])

## Training
n_iter = 20
# Disable every component except NER for the duration of training. Calling
# disable_pipes() with no names disables nothing, so the names must be passed.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):  # only train ner
    # begin_training() initialises weights from scratch, which is only wanted for
    # a freshly created NER component. An already-trained one must continue
    # from its current weights via resume_training().
    if ner_is_new:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    for i in range(n_iter):
        losses = {}
        nlp.update(X, Y, sgd=optimizer, drop=0.0, losses=losses)
        print("Losses", losses)

[Existing Entities] =  ['B-ORG', 'B-DATE', 'B-PERSON', 'B-GPE', 'B-MONEY', 'B-CARDINAL', 'B-NORP', 'B-PERCENT', 'B-WORK_OF_ART', 'B-LOC', 'B-TIME', 'B-QUANTITY', 'B-FAC', 'B-EVENT', 'B-ORDINAL', 'B-PRODUCT', 'B-LAW', 'B-LANGUAGE', 'I-ORG', 'I-DATE', 'I-PERSON', 'I-GPE', 'I-MONEY', 'I-CARDINAL', 'I-NORP', 'I-PERCENT', 'I-WORK_OF_ART', 'I-LOC', 'I-TIME', 'I-QUANTITY', 'I-FAC', 'I-EVENT', 'I-ORDINAL', 'I-PRODUCT', 'I-LAW', 'I-LANGUAGE', 'L-ORG', 'L-DATE', 'L-PERSON', 'L-GPE', 'L-MONEY', 'L-CARDINAL', 'L-NORP', 'L-PERCENT', 'L-WORK_OF_ART', 'L-LOC', 'L-TIME', 'L-QUANTITY', 'L-FAC', 'L-EVENT', 'L-ORDINAL', 'L-PRODUCT', 'L-LAW', 'L-LANGUAGE', 'U-ORG', 'U-DATE', 'U-PERSON', 'U-GPE', 'U-MONEY', 'U-CARDINAL', 'U-NORP', 'U-PERCENT', 'U-WORK_OF_ART', 'U-LOC', 'U-TIME', 'U-QUANTITY', 'U-FAC', 'U-EVENT', 'U-ORDINAL', 'U-PRODUCT', 'U-LAW', 'U-LANGUAGE', 'O', 'B-DATED', 'I-DATED', 'L-DATED', 'U-DATED']

[All Entities] =  ['B-ORG', 'B-DATE', 'B-PERSON', 'B-GPE', 'B-MONEY', 'B-CARDINAL', 'B-NORP', 'B-P

In [23]:
# Re-run the pipeline on the text and list the entities it now predicts.
doc = nlp(text)
for span in doc.ents:
    print(span.text, span.label_)

Sebastian NORP
Google ORG
2007 DATE
American NORP
Recode ORG
earlier this week DATE
