In [6]:
import spacy

In [7]:
nlp = spacy.load('en_core_web_sm')

In [8]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [10]:
for token in doc:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [11]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x15455cdc880>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x15455ea8dc0>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x15455ea88e0>)]

In [12]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [15]:
doc2 = nlp(u"Tesla isn't  looking into startups anymore.")

In [16]:
for token in doc2:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
  SPACE 
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [17]:
doc2

Tesla isn't  looking into startups anymore.

In [18]:
doc2[0]

Tesla

In [19]:
for i in doc2:
    print(i)

Tesla
is
n't
 
looking
into
startups
anymore
.


In [20]:
doc2[0].pos_

'PROPN'

In [21]:
# Sample text for the span-slicing demo; the trailing backslashes continue one
# string literal across several source lines.
doc3 = nlp(u'Although commonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [22]:
life_quote = doc3[16:30]

In [23]:
print(life_quote)

"Life is what happens to us while we are making other plans"


In [25]:
type(life_quote)

spacy.tokens.span.Span

In [26]:
type(doc3)

spacy.tokens.doc.Doc

In [30]:
doc4 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [31]:
doc4

This is the first sentence. This is another sentence. This is the last sentence.

In [32]:
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [33]:
doc4[6].is_sent_start

True

In [35]:
doc9 = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')

In [36]:
for chunk in doc9.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


In [37]:
from spacy import displacy

In [38]:
doc = nlp(u"Apple is going to build a UK factory for $6 million.")

In [39]:
displacy.render(doc,style='dep', jupyter = True, options = {'distance':110})

In [40]:
# Sample sentence containing an organisation, a quantity and a money amount.
doc1 = nlp(u"Over the last quarter Apple sold nearly 20 thousand ipods for a profit of $9 billion")

In [61]:
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly', 'fairness']

In [54]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [64]:
p_stemmer = PorterStemmer()


In [65]:
s_stemmer = SnowballStemmer(language='english')

In [66]:
for word in words:
    print(word + '  ------>' + p_stemmer.stem(word))

run  ------>run
runner  ------>runner
ran  ------>ran
runs  ------>run
easily  ------>easili
fairly  ------>fairli
fairness  ------>fair


In [67]:
for word in words:
    print(word + '  ------>' + s_stemmer.stem(word))

run  ------>run
runner  ------>runner
ran  ------>ran
runs  ------>run
easily  ------>easili
fairly  ------>fair
fairness  ------>fair


In [70]:
words2 = ['generous', 'generation', 'generate', 'generously', 'sagacious', 'generic', 'sage']

In [71]:
for word in words2:
    print(word + '  ------>' + s_stemmer.stem(word))

generous  ------>generous
generation  ------>generat
generate  ------>generat
generously  ------>generous
sagacious  ------>sagaci
generic  ------>generic
sage  ------>sage


In [72]:
import spacy

In [73]:
nlp = spacy.load('en_core_web_sm')

In [74]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today.")

In [76]:
for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today
. 	 PUNCT 	 12646065887601541794 	 .


In [77]:
print(nlp.Defaults.stop_words)

{'with', 'regarding', 'where', 'becoming', 'cannot', 'thereby', 'beside', 'anywhere', 'whoever', 'often', 'how', 'last', 'such', 'is', 'ca', 'and', 'her', 'neither', 'we', 'yourselves', 'nowhere', 'they', 'into', 'this', 'everyone', 'afterwards', 'were', 'all', 'have', '’re', 'does', 'seemed', 'nobody', 'up', 'your', '‘d', '‘ve', 'six', 'too', '’ll', 'anything', 'done', 'after', 'move', 'meanwhile', 'when', 'be', 'hence', 'name', 'towards', 'perhaps', 'was', 'throughout', 'upon', 'wherever', "'m", 'many', 'under', 'none', 'mostly', 'above', 'whereafter', 'which', "'ll", 'make', 'else', 'part', 'some', 'several', 'keep', 'two', 'within', 'am', 'used', 'of', 'anyhow', 'well', 'enough', 'full', 'thence', 'either', 'less', 'thus', '‘re', 'both', 'at', 'just', 'back', 'might', 'everything', 'she', 'mine', 'our', 'thereafter', 'me', 'serious', 'hundred', 'through', 'since', 'empty', '‘s', 'via', 'nothing', 'latter', 'herein', 'must', 'thereupon', 'thru', 'what', 'he', 'few', 'now', 'everywhe

In [78]:
len(nlp.Defaults.stop_words)

326

In [79]:
nlp.vocab['mystery'].is_stop

False

In [80]:
nlp.Defaults.stop_words.add('btw')

In [81]:
nlp.vocab['btw'].is_stop = True

In [82]:
len(nlp.Defaults.stop_words)

327

In [84]:
nlp.vocab['btw'].is_stop

True

In [85]:
# discard() instead of remove(): stop_words is a set, and remove() raises
# KeyError if this cell is re-run after 'beyond' has already been taken out.
nlp.Defaults.stop_words.discard('beyond')

In [86]:
nlp.vocab['beyond'].is_stop = False

In [87]:
from spacy.matcher import Matcher

In [90]:
matcher = Matcher(nlp.vocab) # object instantiated

In [94]:
# Token patterns for three surface forms of the same term:
#   SolarPower   -> a single token
#   Solar-power  -> 'solar', one punctuation token, 'power'
#   Solar power  -> two tokens
# 'LOWER' compares against the lower-cased token text, so the match ignores case.
pattern1 = [{'LOWER':'solarpower'}]
# The attribute key is written in its canonical upper-case form, 'IS_PUNCT'.
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]
pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}]
patterns = [pattern1, pattern2, pattern3]

In [96]:
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [99]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [103]:
found_matches = matcher(doc)

In [104]:
found_matches

[(8656102463236116519, 1, 3),
 (8656102463236116519, 10, 11),
 (8656102463236116519, 13, 16)]

In [105]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [106]:
matcher.remove('SolarPower')

In [108]:
# Redefine the patterns:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]
# Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower', None, pattern1, pattern2)

In [109]:
doc2 = nlp(u"Solar--power is solarpower yay!")

In [110]:
found_matches = matcher(doc2)

In [111]:
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [112]:
from spacy.matcher import PhraseMatcher

In [113]:
matcher = PhraseMatcher(nlp.vocab)

In [117]:
with open('../TextFiles/reaganomics.txt') as f:
    doc3 = nlp(f.read())

In [118]:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle', 'free market']

In [119]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [121]:
phrase_patterns

[voodoo economics, supply-side economics, trickle, free market]

In [123]:
matcher.add('EconMatcher', None,*phrase_patterns)

In [128]:
found_matches = matcher(doc3)

In [131]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 50),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 644, 646),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2987, 2988),
 (3680293220734633682, 3012, 3013)]

In [133]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc3[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 50 trickle
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 644 646 free market
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2987 2988 trickle
3680293220734633682 EconMatcher 3012 3013 trickle


In [134]:
# Recap of the phrase-matching steps, run on a fresh matcher. Adding the same
# phrases under a second key to the matcher that already holds 'EconMatcher'
# would report every hit twice, once per key.
matcher = PhraseMatcher(nlp.vocab)

# First, create a list of match phrases:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle', 'free market']

# Next, convert each phrase to a Doc object:
phrase_patterns = [nlp(text) for text in phrase_list]

# Pass each Doc object into matcher (note the use of the asterisk!):
matcher.add('VoodooEconomics', None, *phrase_patterns)

# Build a list of (match_id, start, end) tuples:
matches = matcher(doc3)

In [137]:
# Print each match together with up to 10 tokens of context on either side.
for match_id, start, end in found_matches:
    # Clamp the left edge at 0: a negative slice start counts from the end of
    # the doc, which would give the wrong window for a match in the first 10 tokens.
    window_start = max(start - 10, 0)
    span = doc3[window_start:end + 10]       # matched tokens plus surrounding context
    print(start,'\n', end, span.text)

41 
 45 during the 1980s. These policies are commonly associated with supply-side economics, referred to as trickle-down economics or voodoo
49 
 50 associated with supply-side economics, referred to as trickle-down economics or voodoo economics by political opponents,
54 
 56 economics, referred to as trickle-down economics or voodoo economics by political opponents, and free-market economics by
644 
 646 proposals as a return to the free enterprise principles, free market economy that had been in favor before the Great Depression
673 
 677 At the same time he attracted a following from the supply-side economics movement, which formed in opposition to Keynesian demand-
2987 
 2988 against institutions.[66] His policies became widely known as "trickle-down economics", due to the significant cuts
3012 
 3013 brackets, as that extra money for the wealthy could trickle along to low-income groups.[67]

Federal income


In [138]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [139]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [143]:
print((doc[4]).tag_)

VBD


In [144]:
print(doc[4].pos_)

VERB


In [145]:
import spacy

In [146]:
nlp = spacy.load('en_core_web_sm')

In [147]:
def show_ents(doc):
    """Print one line per named entity found in ``doc``.

    Each line has the form ``<text> - <label> - <explanation>``, where the
    explanation is ``spacy.explain`` applied to the entity label (converted
    with ``str``). When ``doc.ents`` is empty, a single
    'No entities found' line is printed instead.
    """
    if not doc.ents:
        print('No entities found')
        return
    for entity in doc.ents:
        label = entity.label_
        description = str(spacy.explain(label))
        print(entity.text + ' - ' + label + ' - ' + description)

In [148]:
doc = nlp(u'Hi how are you?')

In [149]:
show_ents(doc)

No entities found


In [150]:
doc = nlp(u'May I go to Washington, DC next May to see the Washington Monument?')

In [151]:
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [152]:
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [166]:
# Adjacent string literals are concatenated as-is, so the first literal must
# end with a space to keep the two sentences from running together.
doc = nlp(u"Our company created a brand new vacuum cleaner. "
          u"This new vacuum-cleaner is the best in show.")

In [168]:
from spacy.matcher import PhraseMatcher

In [169]:
matcher = PhraseMatcher(nlp.vocab)

In [170]:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [173]:
phrase_patterns = [nlp(text) for text in phrase_list]
phrase_patterns

[vacuum cleaner, vacuum-cleaner]

In [180]:
matcher.add('newproduct',None, *phrase_patterns)
found_matches = matcher(doc)

In [181]:
from spacy.tokens import Span

In [182]:
PROD = doc.vocab.strings[u"PRODUCT"]

In [183]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [184]:
new_ents = [Span(doc, match[1], match[2], label=PROD) for match in found_matches]

In [185]:
doc.ents = list(doc.ents) + new_ents

In [186]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [187]:
doc = nlp(u"Originally I paid $29.95 for this car toy, but now it is marked down by 10 dollars.")

In [188]:
import spacy

In [189]:
nlp = spacy.load('en_core_web_sm')

In [190]:
from spacy import displacy

In [206]:
# Adjacent string literals are concatenated as-is; the first one ends with
# ". " so the text does not read "$6 millionBy contrast" and splits into two sentences.
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. "
          u"By contrast, Sony only sold 8 thousand Walkman music players.")

In [207]:
# NOTE(review): spaCy's displaCy documentation lists "color" and "bg" among the
# options of the dependency ('dep') visualizer, while the entity ('ent')
# visualizer documents "ents", "colors" and "template". These two keys
# therefore probably have no visible effect on the render below; confirm
# against the installed spaCy version.
options = {"color":'black', "bg":'blue'}

displacy.render(doc, style='ent', jupyter=True, options=options)

In [208]:
displacy.render(doc, style='ent', jupyter=True, options=options)

In [209]:
for sent in doc.sents:
    displacy.render(nlp(sent.text),style='ent',jupyter=True)

In [218]:
colors = {'ORG':'red'}
options = {'ents':['PRODUCT', 'ORG'],'colors':colors}

In [219]:
displacy.render(doc, style='ent', jupyter=True, options=options)

In [220]:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [222]:
doc1 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [223]:
doc1.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [224]:
for sent in doc.sents:
    print(sent)
    print('\n')

This is the first sentence.


This is another sentence.


This is the last sentence.




In [225]:
for sent in doc1.sents:
    print(sent)
    print('\n')

"Management is doing the right things; leadership is doing the right things."


-Peter Drucker




In [235]:
def set_custom_boundaries(doc):
    """Mark the token after every ';' as the start of a sentence.

    The last token is not examined, since nothing follows it. The flag is
    set in place via ``is_sent_start`` and the same ``doc`` object is returned.
    """
    semicolon_positions = [tok.i for tok in doc[:-1] if tok.text == ';']
    for position in semicolon_positions:
        doc[position + 1].is_sent_start = True
    return doc

In [238]:
# Insert the custom boundary rule ahead of the parser. The membership check
# keeps the cell re-runnable: adding a component whose name is already in the
# pipeline raises an error.
if 'set_custom_boundaries' not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundaries, before='parser')

nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [239]:
doc[:-1]

This is the first sentence. This is another sentence. This is the last sentence

In [240]:
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [241]:
for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things."
-Peter Drucker
