In [2]:
import spacy

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [5]:
doc[4]

jumped

In [6]:
 doc[4].pos_

'VERB'

In [7]:
doc[4].tag_ #Fine grained. VBD means past tense verb

'VBD'

In [9]:
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective
brown      ADJ        JJ         adjective
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


In [10]:
doc = nlp(u"I read books on NLP.")

In [11]:
word = doc[1]

In [12]:
word.text

'read'

In [13]:
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

I          PRON       PRP        pronoun, personal
read       VERB       VBP        verb, non-3rd person singular present
books      NOUN       NNS        noun, plural
on         ADP        IN         conjunction, subordinating or preposition
NLP        PROPN      NNP        noun, proper singular
.          PUNCT      .          punctuation mark, sentence closer


"read" is treated as a present verb, not past

In [14]:
doc = nlp(u"I read a book on NLP.")

In [15]:
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

I          PRON       PRP        pronoun, personal
read       VERB       VBD        verb, past tense
a          DET        DT         determiner
book       NOUN       NN         noun, singular or mass
on         ADP        IN         conjunction, subordinating or preposition
NLP        PROPN      NNP        noun, proper singular
.          PUNCT      .          punctuation mark, sentence closer


Now "read" is past tense.

In [16]:
doc = nlp(u"I read books on NLP in 2018.")

In [17]:
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

I          PRON       PRP        pronoun, personal
read       VERB       VBP        verb, non-3rd person singular present
books      NOUN       NNS        noun, plural
on         ADP        IN         conjunction, subordinating or preposition
NLP        PROPN      NNP        noun, proper singular
in         ADP        IN         conjunction, subordinating or preposition
2018       NUM        CD         cardinal number
.          PUNCT      .          punctuation mark, sentence closer


Now the tagger incorrectly labels "read" as present tense, even though "in 2018" shows that the action happened in the past.

In [18]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [24]:
POS_counts = doc.count_by(spacy.attrs.POS)

In [25]:
POS_counts

{96: 1, 83: 3, 99: 1, 84: 1, 89: 2, 91: 3, 93: 1}

In [26]:
# Walk the POS ids in ascending order, printing each id, its name looked up in the vocab, and its count.
for pos_id, count in sorted(POS_counts.items()):
    pos_name = doc.vocab[pos_id].text
    print(f"{pos_id} {pos_name:{5}} {count}")

83 ADJ   3
84 ADP   1
89 DET   2
91 NOUN  3
93 PART  1
96 PUNCT 1
99 VERB  1


In [27]:
from spacy import displacy

In [28]:
displacy.render(doc, style='dep', jupyter=True)

In [31]:
# displaCy rendering options. 'compact' is a real boolean: any non-empty string
# (including 'False') is truthy, so a string value cannot switch compact mode off.
options = {'distance': 100, 'compact': True, 'color': 'blue', 'bg': 'grey', 'font': 'Times'}

In [32]:
displacy.render(doc, style='dep', jupyter=True, options=options)

In [33]:
doc2 = nlp(u"This is a sentence. This is another sentence, possibly longer than the first.")

In [34]:
spans = list(doc2.sents)

In [None]:
# NOTE: the log below shows a server being started on port 5000 and later shut down, so this
# call appears to keep the cell busy while serving — presumably it was stopped by interrupting
# the kernel (the cell has no execution count).
displacy.serve(spans, style='dep', options=options)


[93m    Serving on port 5000...[0m
    Using the 'dep' visualizer



127.0.0.1 - - [26/Jun/2023 15:57:03] "GET / HTTP/1.1" 200 9817
127.0.0.1 - - [26/Jun/2023 15:57:04] "GET /favicon.ico HTTP/1.1" 200 9817
127.0.0.1 - - [26/Jun/2023 15:57:04] "GET /~@fontsource/roboto/400.css HTTP/1.1" 200 9817
127.0.0.1 - - [26/Jun/2023 15:57:04] "GET /~@fontsource/roboto/700.css HTTP/1.1" 200 9817
127.0.0.1 - - [26/Jun/2023 15:57:04] "GET /assets/images/clockify_logo_dark.svg HTTP/1.1" 200 9817



    Shutting down server on port 5000.



### Named Entity Recognition

In [39]:
def show_ents(doc):
    """Print every named entity in ``doc`` as ``text - label - explanation``.

    Prints ``No ents`` when ``doc.ents`` is empty.
    """
    if not doc.ents:
        print("No ents")
        return
    for ent in doc.ents:
        # spacy.explain() returns None for labels that are not in its glossary
        # (e.g. a custom label); formatting through an f-string prints "None"
        # instead of raising TypeError on `str + None`.
        print(f"{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}")

In [40]:
doc = nlp(u"May I go to Washington DC next May to see the Lincoln Memorial?")

In [41]:
show_ents(doc)

Washington DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Lincoln Memorial - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [42]:
doc = nlp(u"Can I please have $500 of Microsoft stock?")

In [43]:
show_ents(doc)

500 - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.


In [54]:
doc = nlp(u"Tesla to build a U.K. factory for $6 million")

In [55]:
show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [56]:
from spacy.tokens import Span

In [57]:
ORG = doc.vocab.strings[u"ORG"]

In [58]:
ORG

381

In [59]:
new_ent = Span(doc, 0, 1, label=ORG)

In [60]:
doc.ents = list(doc.ents) + [new_ent]

In [61]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [62]:
# Adjacent string literals are joined with no separator, so the first literal ends
# with a space to keep the two sentences apart ("cleaner. This", not "cleaner.This").
doc = nlp(u"Our company created a brand new vacuum cleaner. "
          u"This new vacuum-cleaner is the best in show.")

In [63]:
show_ents(doc)

No ents


In [64]:
from spacy.matcher import PhraseMatcher

In [65]:
matcher = PhraseMatcher(nlp.vocab)

In [66]:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [67]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [68]:
matcher.add('newproduct', None, *phrase_patterns)

In [69]:
found_matches = matcher(doc)

In [70]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [71]:
from spacy.tokens import Span

In [72]:
PROD = doc.vocab.strings[u"PRODUCT"]

In [73]:
# Each match is a (match_id, start, end) triple; build one PRODUCT span per match.
new_ents = [Span(doc, start, end, label=PROD) for _match_id, start, end in found_matches]

In [74]:
doc.ents = list(doc.ents) + new_ents

In [75]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [80]:
doc = nlp(u"Originally I paid $29.95 for this toy, but now it's marked down by 20 dollars")

In [81]:
show_ents(doc)

29.95 - MONEY - Monetary values, including unit
20 dollars - MONEY - Monetary values, including unit


In [82]:
[ent for ent in doc.ents if ent.label_ == "MONEY"]

[29.95, 20 dollars]

### Visualize NER

In [85]:
# Adjacent string literals are joined with no separator; without the ". " the text
# reads "$4 millionBy contrast" and contains no break between the two sentences.
doc = nlp(u"Over the last quarter, Apple sold 211 thousand iPods for $4 million. "
          u"By contrast, Sony only sold 7 thousand Walkmans.")

In [86]:
displacy.render(doc, style='ent', jupyter=True)

It didn't parse out iPods correctly

In [87]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)

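This looks the same as the previous output for me. Rendering sentence by sentence can only differ when `doc.sents` yields more than one sentence, so the input text needs a real sentence break between the two statements.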

In [89]:
options = {'ents':['ORG']}

In [90]:
displacy.render(doc, style='ent', jupyter=True, options=options)

There are also options to change colors. As always, you can use `displacy.serve()` instead.

### Sentence Segmentation

In [91]:
doc = nlp(u"This is the 1st sentence. This is the 2nd sentence. And this is the third and final sentence.")

In [92]:
for sent in doc.sents:
    print(sent)

This is the 1st sentence.
This is the 2nd sentence.
And this is the third and final sentence.


In [94]:
doc.sents[0] # it is a generator, not a list

TypeError: 'generator' object is not subscriptable

In [95]:
list(doc.sents)[0]

This is the 1st sentence.

In [96]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [97]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [98]:
doc.text

'"Management is doing the right things; leadership is doing the right things." - Peter Drucker'

In [99]:
for sent in doc.sents:
    print(sent)

"Management is doing the right things; leadership is doing the right things."
- Peter Drucker


In [103]:
# Add a segmentation rule
def set_custom_boundaries(doc):
    """Mark the token following each semicolon as a sentence start and return ``doc``.

    The last token is not examined, since nothing follows it.
    """
    semicolon_positions = [tok.i for tok in doc[:-1] if tok.text == ';']
    for position in semicolon_positions:
        doc[position + 1].is_sent_start = True
    return doc

In [None]:
nlp.add_pipe(set_custom_boundaries, before='parser')

In [109]:
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [110]:
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [111]:
for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things.
" - Peter Drucker


It is unclear why the closing quotation mark (") is now grouped with the last sentence instead of ending the second one.

In [112]:
# Change seg rules
# Reload the model so `nlp` is rebound to a newly loaded pipeline; the
# set_custom_boundaries component was added to the previous `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [113]:
mystring = u"This is a sentence. Followed by a second.\n\n This sentence\n is split."

In [115]:
print(mystring)

This is a sentence. Followed by a second.

 This sentence
 is split.


In [116]:
doc = nlp(mystring)

In [117]:
for sent in doc.sents:
    print(sent)

This is a sentence.
Followed by a second.

 
This sentence
 is split.


In [118]:
from spacy.pipeline import SentenceSegmenter

In [119]:
def split_on_newlines(doc):
    """Yield consecutive slices of ``doc``, cutting just before the token that follows a newline token.

    A token whose text starts with a newline closes the current slice; the slice
    itself is emitted when the next token is reached. The token that opens a new
    slice is not itself checked for a newline. Whatever remains is yielded last.
    """
    span_start = 0
    pending_break = False
    for tok in doc:
        if not pending_break:
            # Remember that the previous slice should end before the next token.
            pending_break = tok.text.startswith('\n')
            continue
        yield doc[span_start:tok.i]
        span_start = tok.i
        pending_break = False

    yield doc[span_start:]

In [120]:
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)

In [121]:
nlp.add_pipe(sbd)

In [122]:
doc = nlp(mystring)

In [123]:
for sent in doc.sents:
    print(sent)

This is a sentence. Followed by a second.

 
This sentence
 
is split.
