In [77]:
import spacy

In [78]:
nlp = spacy.load('en_core_web_sm')

In [6]:
doc = nlp(u'Tesla is looking at buying U.S. start up for $6 million')

In [13]:
for token in doc:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN dobj
start VERB advcl
up PART prt
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [14]:
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x1f4e1122588>),
 ('parser', <spacy.pipeline.DependencyParser at 0x1f4e11c2b48>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x1f4e11c2ba0>)]

In [16]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [17]:
doc2 = nlp(u"Tesla isn't looking into startups anymore." )

In [19]:
for token in doc2:
    print(token.text, token.pos_, token.dep_)


Tesla PROPN nsubj
is VERB aux
n't ADV neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [21]:
doc2[0].pos_

'PROPN'

In [22]:
doc2[0].dep_

'nsubj'

In [24]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [25]:
life_quote = doc3[16:30]

In [26]:
print(life_quote)

"Life is what happens to us while we are making other plans"


In [27]:
type(life_quote)

spacy.tokens.span.Span

In [28]:
doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [29]:
type(doc4)

spacy.tokens.doc.Doc

In [30]:
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [33]:
mystring = '"We\'re moving to L.A.!"'

In [36]:
print(mystring)

"We're moving to L.A.!"


In [37]:
doc = nlp(mystring)

In [38]:
for token in doc:
    print(token)

"
We
're
moving
to
L.A.
!
"


In [39]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")

for t in doc2:
    print(t)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [40]:
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')

for t in doc3:
    print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [41]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

for t in doc4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [42]:
doc4.vocab

<spacy.vocab.Vocab at 0x1f4dfdf2048>

In [43]:
len(doc4.vocab)

57852

In [44]:
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')


In [45]:
for token in doc8:
    print(token.text, end=' | ')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [50]:
for entity in doc8.ents:
    print(entity)
    print(entity.label_)
    print(str(spacy.explain(entity.label_)))
    print('\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [47]:
for entity in doc8.ents:
    print(entity)

Apple
Hong Kong
$6 million


In [51]:
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

In [53]:
for chunk in doc9.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


In [54]:
import nltk

In [63]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [57]:
p_stemmer = PorterStemmer()

In [60]:
words = ['run', 'runner', 'runs', 'running', 'ran', 'easily', 'fiarly', 'Randy']

In [61]:
for word in words:
    print(word + '---->' + p_stemmer.stem(word))

run---->run
runner---->runner
runs---->run
running---->run
ran---->ran
easily---->easili
fiarly---->fiarli
Randy---->randi


In [66]:
s_stemmer = SnowballStemmer(language='english')

In [67]:
for word in words:
    print(word + '---->' + s_stemmer.stem(word))

run---->run
runner---->runner
runs---->run
running---->run
ran---->ran
easily---->easili
fiarly---->fiar
Randy---->randi


In [68]:
words = ['generous','generation','generously','generate', 'general', 'General']

In [69]:
for word in words:
    print(word + '---->' + s_stemmer.stem(word))

generous---->generous
generation---->generat
generously---->generous
generate---->generat
general---->general
General---->general


In [70]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

In [71]:
for t in doc1:
    print(t.text,'\t', t.pos_,'\t', t.lemma, '\t', t.lemma_)

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 VERB 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 ADP 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 ADP 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [72]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [73]:
show_lemmas(doc1)

I            PRON   561228191312463089     -PRON-
am           VERB   10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
because      ADP    16950148841647037698   because
I            PRON   561228191312463089     -PRON-
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        ADP    10066841407251338481   since
I            PRON   561228191312463089     -PRON-
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today


In [74]:
from spacy.matcher import Matcher

In [92]:
matcher = Matcher(nlp.vocab)

In [76]:
pattern1 = [{'LOWER':'solarpower'}]

In [77]:
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]

In [78]:
pattern3 = [{'LOWER':'solar'}, {'LOWER':'power'}]

In [79]:
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [80]:
doc = nlp(u"The Solar Power industry continues to grow a solarpower increases. Solar-power is great")

In [85]:
found_matches = matcher(doc)

In [83]:
print(fount_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [86]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [87]:
with open('../TextFiles/reaganomics.txt') as f:
    doc3 = nlp(f.read())

In [95]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [89]:
# First, create a list of match phrases:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

In [90]:
# Next, convert each phrase to a Doc object:
phrase_patterns = [nlp(text) for text in phrase_list]


In [96]:
# Pass each Doc object into matcher (note the use of the asterisk!):
matcher.add('VoodooEconomics', None, *phrase_patterns)


In [97]:
found_matches = matcher(doc3)

In [98]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

3473369816841043438 VoodooEconomics 41 45 
3473369816841043438 VoodooEconomics 49 53 
3473369816841043438 VoodooEconomics 54 56 
3473369816841043438 VoodooEconomics 61 65 
3473369816841043438 VoodooEconomics 673 677 
3473369816841043438 VoodooEconomics 2985 2989 


In [99]:
doc = nlp(u"The quick brown fox jumped over the lazy dog")

In [54]:
from spacy import displacy

In [101]:
displacy.render(doc, style='dep',jupyter=True)

In [104]:
options = {'distance':110,'compact':'True', 'color':'yellow','bg':'#E64A19','font':'algeria'}

In [105]:
displacy.render(doc, style='dep',jupyter=True,options=options)

In [106]:
doc2 = nlp(u'This is a sentence. This is another sentence. THis is a terrily long sentence')

In [107]:
spans = list(doc2.sents)

In [None]:
displacy.serve(spans,style='dep',options={'distance':110})


    Serving on port 5000...
    Using the 'dep' visualizer



127.0.0.1 - - [21/Jul/2019 01:21:22] "GET / HTTP/1.1" 200 10079
127.0.0.1 - - [21/Jul/2019 01:21:22] "GET /favicon.ico HTTP/1.1" 200 10079


In [1]:
 127.0.0.1:5000

SyntaxError: invalid syntax (<ipython-input-1-db74db3d53b0>, line 1)

In [12]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text+' - '+ ent.label_ + ' - '+ str(spacy.explain(ent.label_)))
    else:
        print('No entities found')

In [13]:
doc = nlp(u'Hi how are you?')

In [14]:
show_ents(doc)

No entities found


In [17]:
doc = nlp(u'May I go to Washington, DC next May to see the Washington Monument?')

In [18]:
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [28]:
doc= nlp (u"Our company created a brand new vaccum cleaner. This new vaccum-cleaner was really good")

In [29]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [31]:
phrase_list = ['vaccum cleaner', 'vaccum-cleaner']

In [32]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [33]:
matcher.add('newproduct',None,*phrase_patterns)

In [34]:
found_matches = matcher(doc)

In [35]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [38]:
from spacy.tokens import Span

In [40]:
PROD = doc.vocab.strings[u"PRODUCT"]

In [42]:
new_ents = [Span(doc,match[1],match[2], label = PROD) for match in found_matches]

In [43]:
new_ents

[vaccum cleaner, vaccum-cleaner]

In [44]:
doc.ents = list(doc.ents) + new_ents

In [45]:
show_ents(doc)

vaccum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vaccum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [47]:
doc = nlp(u'Orignally i paid a $ 29.90 fo rht ecar, but now it is marked for ten dollors. In future it would cost millions')

In [51]:
len([ent for ent in doc.ents if ent.label_ == "MONEY"])

1

In [83]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 millions. \n By contrast; Sony only sold 8 thousand Walkman music players. \n\nOn the other hand; the movie Players made a million dollars")

In [63]:
displacy.render(doc, style='ent', jupyter = True)

In [64]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter = True)

In [67]:
options = {'ents':['PRODUCT','ORG']}

In [68]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter = True, options = options)

In [69]:
for sent in doc.sents:
    print(sent)

Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 millions.
By contrast, Sony only sold 8 thousand Walkman music players.
On the other hand, the movie Players made a million dollars


In [72]:
def set_custom_boundaries(doc):
    for t in doc:
        print(t,'\t',t.i)

In [73]:
set_custom_boundaries(doc)

Over 	 0
the 	 1
last 	 2
quarter 	 3
Apple 	 4
sold 	 5
nearly 	 6
20 	 7
thousand 	 8
iPods 	 9
for 	 10
a 	 11
profit 	 12
of 	 13
$ 	 14
6 	 15
millions 	 16
. 	 17
By 	 18
contrast 	 19
, 	 20
Sony 	 21
only 	 22
sold 	 23
8 	 24
thousand 	 25
Walkman 	 26
music 	 27
players 	 28
. 	 29
On 	 30
the 	 31
other 	 32
hand 	 33
, 	 34
the 	 35
movie 	 36
Players 	 37
made 	 38
a 	 39
million 	 40
dollars 	 41


In [79]:
from spacy.pipeline import SentenceSegmenter

In [82]:
def split_on_newlines(doc):
    start = 0
    seen_newline = false
    
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline  = False
        elif word.text.startswith('\n'):
            seen_newline = True
    yield doc[start:]

In [84]:
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)

In [85]:
nlp.add_pipe(sbd)

In [86]:
for sentence in doc.sents:
    print(sentence)

Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 millions. 
 
By contrast; Sony only sold 8 thousand Walkman music players. 


On the other hand; the movie Players made a million dollars
