## Part of Speech

In [1]:
import spacy

In [2]:
# Load the 'en_core_web_sm' pipeline once; every later cell calls it as nlp(text).
nlp = spacy.load('en_core_web_sm')

In [4]:
# Process the sample sentence; the apostrophe is escaped (\') because the literal is single-quoted.
doc = nlp(u'The quick brown fox jumped over the lazy dog\'s back.')

In [5]:
# Print the text held by the processed document.
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [6]:
# Index the document by token position (0-based); position 4 is 'jumped'.
print(doc[4])

jumped


In [7]:
# pos_ is the string form of the token's part-of-speech label (VERB here).
print(doc[4].pos_)

VERB


In [9]:
# tag_ is the string form of the token's more detailed tag (VBD here).
print(doc[4].tag_)

VBD


In [15]:
# Print one aligned row per token: text, POS label, tag, and the glossary
# description of the tag.
# spacy.explain() returns None for a label it does not know, and None cannot be
# formatted with a width spec (TypeError), so the description goes through str()
# before the {30} padding is applied.
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {str(spacy.explain(token.tag_)):{30}}")

The        DET        DT         determiner                    
quick      ADJ        JJ         adjective (English), other noun-modifier (Chinese)
brown      ADJ        JJ         adjective (English), other noun-modifier (Chinese)
fox        NOUN       NN         noun, singular or mass        
jumped     VERB       VBD        verb, past tense              
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner                    
lazy       ADJ        JJ         adjective (English), other noun-modifier (Chinese)
dog        NOUN       NN         noun, singular or mass        
's         PART       POS        possessive ending             
back       NOUN       NN         noun, singular or mass        
.          PUNCT      .          punctuation mark, sentence closer


In [16]:
# Rebind doc to a sentence where 'read' could be present or past tense.
doc = nlp(u'I read books on NLP.')

In [22]:
# Token at index 1, i.e. 'read'.
word = doc[1]

In [25]:
# Bare last expression so the notebook echoes the token's text.
word.text

'read'

In [24]:
# Show text, POS label, tag, and tag description for the selected token.
# str() guards the width-padded field: spacy.explain() returns None for unknown
# labels, and formatting None with a width spec raises TypeError.
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {str(spacy.explain(token.tag_)):{30}}")

read       VERB       VBD        verb, past tense              


In [None]:
# Variant sentence with a singular object, to compare how 'read' is tagged.
# NOTE(review): this cell has no execution count in the saved notebook, so the
# output of the next cell may not reflect this sentence — re-run to confirm.
doc = nlp(u'I read a book on NLP.')

In [26]:
# Re-select token 1 ('read') from the current doc and print its row.
# str() guards the width-padded field: spacy.explain() returns None for unknown
# labels, and formatting None with a width spec raises TypeError.
word = doc[1]
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {str(spacy.explain(token.tag_)):{30}}")

read       VERB       VBD        verb, past tense              


In [27]:
# Back to the first sample sentence for the frequency counts below.
doc = nlp(u'The quick brown fox jumped over the lazy dog\'s back.')

In [28]:
# Count tokens per POS attribute; the result maps integer attribute ids to frequencies.
POS_counts = doc.count_by(spacy.attrs.POS)

In [29]:
# Echo the raw {id: count} mapping; the ids are not human-readable on their own.
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [31]:
# Resolve an id from POS_counts back to its label through the vocab (90 -> 'DET').
doc.vocab[90].text

'DET'

In [33]:
# List each POS id, its label looked up through the vocab, and its count,
# in ascending id order.
for pos_id in sorted(POS_counts):
    pos_label = doc.vocab[pos_id].text
    print(f"{pos_id}. {pos_label:{5}} {POS_counts[pos_id]}")

84. ADJ   3
85. ADP   1
90. DET   2
92. NOUN  3
94. PART  1
97. PUNCT 1
100. VERB  1


In [34]:
# Frequency of each tag in the doc, keyed by the tag's integer id.
TAG_counts = doc.count_by(spacy.attrs.TAG)

# Ascending id order; the label for each id is looked up through the vocab.
for tag_id in sorted(TAG_counts):
    tag_label = doc.vocab[tag_id].text
    print(f"{tag_id}. {tag_label:{5}} {TAG_counts[tag_id]}")

74. POS   1
1292078113972184607. IN    1
10554686591937588953. JJ    3
12646065887601541794. .     1
15267657372422890137. DT    2
15308085513773655218. NN    3
17109001835818727656. VBD   1


In [36]:
# Frequency of each dependency label in the doc, keyed by the label's integer id.
DEP_counts = doc.count_by(spacy.attrs.DEP)

# Ascending id order; the label for each id is looked up through the vocab.
for dep_id in sorted(DEP_counts):
    dep_label = doc.vocab[dep_id].text
    print(f"{dep_id}. {dep_label:{5}} {DEP_counts[dep_id]}")

402. amod  3
415. det   2
429. nsubj 1
439. pobj  1
440. poss  1
443. prep  1
445. punct 1
8110129090154140942. case  1
8206900633647566924. ROOT  1


## Visualizing Part of Speech

In [37]:
# Shorter variant of the sample sentence (no possessive "'s back") for the visualizations.
doc = nlp(u"The quick brown fox jumped over the lazy dog.")

In [38]:
from spacy import displacy

In [39]:
# Draw the dependency view (style='dep') inline in the notebook (jupyter=True).
displacy.render(doc,style='dep',jupyter=True)

In [40]:
# Display settings passed to displacy.render for the dependency view.
# 'compact' is a real boolean: the string 'True' only worked because any
# non-empty string is truthy, which means 'False' would have enabled it too.
options = {'distance':110, 'compact':True,'color':'yellow','bg':'#09a3d5','font':'Times'}

In [41]:
# Same dependency view as before, now with the custom display options applied.
displacy.render(doc,style='dep',jupyter=True, options=options)

In [None]:
# Two-sentence text kept in its own variable (doc2) so doc is left untouched.
doc2 = nlp(u"This is a sentence. This is a sentence, possibly longer than the other. ")

In [None]:
# Collect the sentences of doc2 into a list so they can be handed to displaCy together.
spans= list(doc2.sents)

In [None]:
# NOTE(review): per the displaCy docs, serve() starts a local web server and keeps
# running until interrupted, unlike the render(..., jupyter=True) calls used
# elsewhere in this notebook — confirm this is intended inside a notebook.
# This cell has no execution count in the saved copy.
displacy.serve(spans,style='dep', options={'distance':110})

## Named Entity Recognition

In [42]:
def show_ents(doc):
    """Print one line per named entity in ``doc``.

    Each line has the form ``<text> - <label> - <explanation>``. The explanation
    comes from ``spacy.explain`` and is passed through ``str``, so a ``None``
    result is printed as ``None`` rather than raising. When ``doc.ents`` is
    empty, the single line 'No entities found!' is printed instead.
    """
    if not doc.ents:
        print('No entities found!')
        return
    for entity in doc.ents:
        description = str(spacy.explain(entity.label_))
        print(' - '.join((entity.text, entity.label_, description)))

In [43]:
# A greeting with no names, places, or dates, used to show the empty-result case.
doc = nlp(u'Hey how are you?')

In [44]:
# Exercises the fallback branch: this doc has no entities.
show_ents(doc)

No entities found!


In [45]:
# 'May' occurs twice here: once opening the question and once as a month.
# NOTE(review): 'Manument' looks like a typo for 'Monument' and 'D.C' lacks its
# final period; left as-is because the recorded output below reflects this exact text.
doc = nlp(u'May I go to Washington, D.C next May to see the Washington Manument?')

In [46]:
# Only the second 'May' is reported (as part of the DATE 'next May').
show_ents(doc)

Washington, D.C - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Manument - ORG - Companies, agencies, institutions, etc.


In [47]:
# Headline-style sentence; in the recorded output 'Tesla' is not detected as an entity.
doc = nlp(u'Tesla to build a U.K factory for $6 million')

In [48]:
# Entities found by the model before any manual addition.
show_ents(doc)

U.K - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [50]:
from spacy.tokens import Span

In [52]:
# Look up the integer id for the label string 'ORG' in the vocab's string store;
# it is passed as the label when building the Span.
ORG = doc.vocab.strings[u'ORG']

In [54]:
# Echo the id to show that the label is represented as an integer.
ORG

383

In [55]:
# Span from token 0 up to (not including) token 1 — just 'Tesla' — labelled ORG.
new_ent = Span(doc,0,1,label=ORG)

In [56]:
# doc.ents is copied to a list so the new span can be appended, then assigned back.
# NOTE(review): running this cell a second time appends the same span again on
# top of the one already stored — re-create doc first if you need to re-run.
doc.ents = list(doc.ents) + [new_ent]

In [57]:
# 'Tesla' is now listed as ORG alongside the entities the model found.
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


## Adding Multiple Named Entities

In [59]:
# Two sentences built from adjacent string literals. Adjacent literals are joined
# with nothing in between, so the first one must end with a space; without it the
# text reads "...vacuum cleaner.This new...".
doc = nlp(u'Our company created a brand new vacuum cleaner. '
         u'This new vacuum-cleaner is the best in show')

In [60]:
# The model reports no entities here; the product mentions are added manually below.
show_ents(doc)

No entities found!


In [61]:
from spacy.matcher import PhraseMatcher

In [62]:
# Build a phrase matcher that shares the pipeline's vocabulary.
matcher = PhraseMatcher(nlp.vocab)

In [63]:
# Both spellings of the product name that appear in the text (spaced and hyphenated).
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [64]:
# Turn each phrase into a Doc to use as a PhraseMatcher pattern.
# nlp.make_doc() only tokenizes. The matcher compares token text by default, so
# running the full pipeline (tagger, parser, NER) on every pattern is wasted work.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [66]:
# Register every pattern Doc under the single key 'newproduct'; the None in the
# second position fills the callback slot of this call form (no callback is used).
# NOTE(review): newer spaCy releases expect matcher.add(key, [patterns]) instead —
# confirm against the installed version before upgrading.
matcher.add('newproduct',None,*phrase_patterns)

In [67]:
# Run the matcher over the doc; the result is a list of match tuples.
found_matches = matcher(doc)

In [68]:
# Each tuple holds three values; the second and third are used further down as
# the start and end token positions of a Span.
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [69]:
from spacy.tokens import Span

In [70]:
# Integer id for the label string 'PRODUCT', used as the label of the new spans.
PROD = doc.vocab.strings[u'PRODUCT']

In [71]:
# Redisplay the matches right before they are converted into spans.
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [74]:
# Wrap every match in a PRODUCT-labelled Span. A match is a 3-tuple whose last
# two items are the start and end token positions; the first item is not needed.
new_ents = [Span(doc, start, end, label=PROD) for _match_id, start, end in found_matches]

In [75]:
# Append the new PRODUCT spans to whatever entities the doc already has.
# NOTE(review): running this cell a second time appends the same spans again on
# top of the ones already stored — re-create doc first if you need to re-run.
doc.ents = list(doc.ents) + new_ents

In [76]:
# Both spellings of the product are now reported as PRODUCT entities.
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [78]:
# Sentence with two money amounts written in different ways ('$29.95' and '10 dollars').
doc = nlp(u'Originally I paid $29.95 for this car toy, but now it is marked down by 10 dollars.')

In [79]:
# Keep only the entities whose label string is 'MONEY'; the list is echoed as the cell output.
[ent for ent in doc.ents if ent.label_ == 'MONEY']

[29.95, 10 dollars]

## Visualizing NER

In [82]:
# Two sentences built from adjacent string literals. Adjacent literals are joined
# with nothing in between, so the first one must end with a space; without it the
# text reads "...$6 million.By contrast...", which the per-sentence loop over
# doc.sents further down depends on being split correctly.
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand ipods for a profit of $6 million. '
         u'By contrast, Sony only sold 8 thousand Walkman music players.')

In [83]:
# Draw the entity view (style='ent') for the whole doc inline in the notebook.
displacy.render(doc,style='ent',jupyter=True)

In [85]:
# Render each sentence on its own: the sentence text is run through nlp again to
# get a standalone Doc, which is then passed to displaCy.
for sentence in doc.sents:
    sentence_doc = nlp(sentence.text)
    displacy.render(sentence_doc, style='ent', jupyter=True)

In [96]:
# Per-label styling for the entity view; each value is a CSS color or gradient string.
colors = {
    'ORG': 'red',
    'DATE': 'radial-gradient(yellow,red)',
    'CARDINAL': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)',
}
# 'ents' lists exactly the labels that have a custom color, in the same order.
options = {'ents': list(colors), 'colors': colors}


In [97]:
# Entity view again, this time with the label filter and custom colors applied.
displacy.render(doc,style='ent',jupyter=True,options = options)