In [12]:
import spacy

In [13]:
# Load the 'en_core_web_sm' model; the returned pipeline object is bound to `nlp`
# and is what every later cell calls to process text.
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million') #apply model to some text, doc holds processed text

In [7]:
# One output line per token: its text, part-of-speech tag and dependency label.
for token in doc:
    print(" ".join([token.text, token.pos_, token.dep_]))

Tesla PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [8]:
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x1a5ea7b4ac8>),
 ('parser', <spacy.pipeline.DependencyParser at 0x1a5ea7b26a8>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x1a5ea7b2c48>)]

In [9]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [13]:
doc2 = nlp(u"Tesla isn't        looking into starups anymore.")

In [18]:
# One output line per token: text, part-of-speech tag, dependency label and
# whether the token is flagged as a stop word.
for token in doc2:
    print(f"{token.text} {token.pos_} {token.dep_} {token.is_stop}")

Tesla PROPN nsubj False
is VERB aux True
n't ADV neg False
        SPACE  False
looking VERB ROOT False
into ADP prep True
starups NOUN pobj False
anymore ADV advmod False
. PUNCT punct False


In [16]:
doc2[0], doc2[0].pos_

(Tesla, 'PROPN')

In [17]:
doc2[0].dep_

'nsubj'

In [19]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [20]:
# Slicing a processed document uses token indices, not character offsets;
# tokens 16-29 are the quoted sentence together with its opening and closing
# quotation marks (see the printed output in the next cell).
life_quote = doc3[16:30]

In [21]:
print(life_quote)

"Life is what happens to us while we are making other plans"


In [22]:
type(life_quote)

spacy.tokens.span.Span

In [23]:
type(doc3)

spacy.tokens.doc.Doc

In [24]:
doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [25]:
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [26]:
doc4[6].is_sent_start

True

In [27]:
# Token 8 is not the first token of a sentence, yet this does not evaluate to
# False: the cell shows no output at all, which is what a None result looks like
# in a notebook. NOTE(review): presumably None means "not a sentence start" in
# the installed spaCy version - confirm before writing `is False` checks.
doc4[8].is_sent_start

### Tokenization

In [28]:
mystring = '"We\'re moving to L.A.!"'

In [30]:
print(mystring)

"We're moving to L.A.!"


In [31]:
doc = nlp(mystring)

In [32]:
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


In [33]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")

In [34]:
for token in doc2:
    print(token)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [35]:
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')

In [36]:
for token in doc3:
    print(token)

A
5
km
NYC
cab
ride
costs
$
10.30


In [37]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

In [38]:
for t in doc4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [39]:
len(doc4)

11

In [41]:
len(doc4.vocab) # total size of the vocabulary of the loaded model

57852

In [45]:
doc5 = nlp(u'Apple to build a factory in Hong Kong for $6 million')

In [47]:
for token in doc5:
    print(token.text, end = " | ")

Apple | to | build | a | factory | in | Hong | Kong | for | $ | 6 | million | 

In [51]:
# For every named entity print three lines: the entity text, its label, and
# spaCy's human-readable explanation of that label.
for entity in doc5.ents:
    print(entity, entity.label_, spacy.explain(entity.label_), sep="\n")

Apple
ORG
Companies, agencies, institutions, etc.
Hong Kong
GPE
Countries, cities, states
$6 million
MONEY
Monetary values, including unit


In [52]:
doc6 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

In [53]:
for chunk in doc6.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


In [1]:
from spacy import displacy

In [7]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million")

In [10]:
displacy.render(doc, style='dep', jupyter=True, options={'distance':75})

In [11]:
doc = nlp(u"Over the last quarter, Apple sold nearly 20 thousand iPods for a profit of $4 million.")

In [12]:
displacy.render(doc, style='ent', jupyter=True)

In [13]:
# Serve the dependency visualisation over HTTP so it can be opened in a browser;
# the log below shows it listening on port 5000 until the server is shut down.
# NOTE(review): this call presumably blocks the cell while serving, so it will
# stall a "Run All" until interrupted - confirm and consider leaving it out of
# unattended runs.
displacy.serve(doc, style='dep')


[93m    Serving on port 5000...[0m
    Using the 'dep' visualizer



127.0.0.1 - - [17/Jun/2023 16:19:39] "GET / HTTP/1.1" 200 12899
127.0.0.1 - - [17/Jun/2023 16:19:40] "GET /favicon.ico HTTP/1.1" 200 12899
127.0.0.1 - - [17/Jun/2023 16:19:40] "GET /~@fontsource/roboto/400.css HTTP/1.1" 200 12899
127.0.0.1 - - [17/Jun/2023 16:19:40] "GET /~@fontsource/roboto/700.css HTTP/1.1" 200 12899
127.0.0.1 - - [17/Jun/2023 16:19:40] "GET /assets/images/clockify_logo_dark.svg HTTP/1.1" 200 12899



    Shutting down server on port 5000.



### Stemming

In [17]:
import nltk

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [18]:
from nltk.stem.porter import PorterStemmer

In [19]:
p_stemmer = PorterStemmer()

In [30]:
words = ['run', 'runner', 'running', 'ran', 'runs', 'easily', 'fairly', 'fairness', 'unfairly']

In [31]:
# Show each word next to the stem produced by the Porter stemmer.
for word in words:
    print(f"{word}-----> {p_stemmer.stem(word)}")

run-----> run
runner-----> runner
running-----> run
ran-----> ran
runs-----> run
easily-----> easili
fairly-----> fairli
fairness-----> fair
unfairly-----> unfairli


In [32]:
from nltk.stem.snowball import SnowballStemmer

In [33]:
s_stemmer = SnowballStemmer(language='english')

In [34]:
# Show each word next to the stem produced by the Snowball stemmer.
for word in words:
    print(f"{word}-----> {s_stemmer.stem(word)}")

run-----> run
runner-----> runner
running-----> run
ran-----> ran
runs-----> run
easily-----> easili
fairly-----> fair
fairness-----> fair
unfairly-----> unfair


### Lemmatization

In [35]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today.")

In [39]:
# Tab-separated listing: token text padded to 10 characters, POS tag, lemma.
for token in doc1:
    print("{:<10}\t{}\t\t{}".format(token.text, token.pos_, token.lemma_))

I         	PRON		-PRON-
am        	VERB		be
a         	DET		a
runner    	NOUN		runner
running   	VERB		run
in        	ADP		in
a         	DET		a
race      	NOUN		race
because   	ADP		because
I         	PRON		-PRON-
love      	VERB		love
to        	PART		to
run       	VERB		run
since     	ADP		since
I         	PRON		-PRON-
ran       	VERB		run
today     	NOUN		today
.         	PUNCT		.


In [40]:
def show_lemmas(text):
    """Print an aligned table with one row per token.

    `text` is any iterable of token-like objects exposing `.text`, `.pos_`,
    `.lemma` and `.lemma_` (the notebook passes a processed document).
    Columns are: text (width 12), POS tag (width 6), the integer `lemma`
    value (left-aligned, width 22) and the lemma string. Returns None.
    """
    row_template = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [41]:
show_lemmas(doc1)

I            PRON   561228191312463089     -PRON-
am           VERB   10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
because      ADP    16950148841647037698   because
I            PRON   561228191312463089     -PRON-
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        ADP    10066841407251338481   since
I            PRON   561228191312463089     -PRON-
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today
.            PUNCT  12646065887601541794   .


### Stop words

In [42]:
print(nlp.Defaults.stop_words)

{'so', 'beyond', 'further', 'nor', 'regarding', 'see', 'anyway', 'many', 'therefore', 'whatever', 'to', 'do', 'never', 'seemed', 'least', 'made', 'wherever', 'else', 'who', 'if', 'forty', 'beforehand', 'because', 'beside', 'it', 'sometimes', 'all', 'very', 'make', 'afterwards', 'ourselves', 'quite', 'bottom', 'not', 'whenever', 'something', 'upon', 'whose', 'once', 'before', 'eight', 'during', 'however', 'hereupon', 'doing', 'out', 'per', 'more', 'from', 'cannot', 'its', 'become', 'thereupon', 'anyone', 'were', 'whence', 'latter', 'and', 'no', 'here', 'itself', 'due', 'herself', 'should', 'almost', 'three', 'me', 'between', 'several', 'meanwhile', 'of', 'also', 'others', 'being', 'about', 'same', 'please', 'another', 'serious', 'what', 'well', 'they', 'became', 'besides', 'nevertheless', 'whom', 'since', 'say', 'at', 'first', 'get', 'anywhere', 'below', 'ours', 'have', 'anything', 'may', 'two', 'anyhow', 'thus', 'moreover', 'such', 'seeming', 'towards', 'in', 'perhaps', 'his', 'is', 't

In [45]:
nlp.vocab['me'].is_stop

True

In [46]:
nlp.Defaults.stop_words.add('btw')

In [47]:
# Flag the lexeme itself as a stop word as well; the previous cell only added
# the string 'btw' to the Defaults.stop_words set.
# NOTE(review): presumably both steps are needed for token.is_stop to report
# True on already-loaded vocabulary - confirm.
nlp.vocab['btw'].is_stop = True

In [48]:
nlp.Defaults.stop_words

{'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'btw',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',
 'full',
 'further',
 'get',
 'gi

In [49]:
nlp.Defaults.stop_words.remove('beyond')

In [50]:
# Clear the stop-word flag on the lexeme too, mirroring the removal of 'beyond'
# from Defaults.stop_words in the previous cell.
nlp.vocab['beyond'].is_stop = False

### Phrase Matching and Vocabulary

In [14]:
from spacy.matcher import Matcher

In [15]:
matcher = Matcher(nlp.vocab)

In [23]:
# Three token patterns for spellings of the same term. The 'LOWER' values are
# all lower-case, yet the capitalised "Solar Power" in the sample text is
# matched further down, i.e. the comparison ignores casing.
# One token: SolarPower
pattern1 = [{'LOWER':'solarpower'}]
# Three tokens with a punctuation token in the middle: Solar-power
pattern2 = [{'LOWER':'solar'}, {'IS_PUNCT':True}, {'LOWER':'power'}]
# Two adjacent tokens: Solar power
pattern3 = [{'LOWER':'solar'}, {'LOWER':'power'}]

In [24]:
# Register all three patterns under the single key 'SolarPower'.
# NOTE(review): the positional None looks like the spaCy 2.x on_match callback
# slot; spaCy 3 expects matcher.add(key, [patterns]) instead - confirm the
# installed version before upgrading.
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [25]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [26]:
found_matches = matcher(doc)

In [27]:
found_matches

[(8656102463236116519, 1, 3),
 (8656102463236116519, 10, 11),
 (8656102463236116519, 13, 16)]

In [28]:
# Report each match as: hash id, rule name, start token, end token, matched text.
for match_id, start, end in found_matches:
    span = doc[start:end]                       # tokens covered by this match
    string_id = nlp.vocab.strings[match_id]     # rule name looked up from its hash
    print(f"{match_id} {string_id} {start} {end} {span.text}")

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [29]:
matcher.remove('SolarPower')

In [30]:
# Single-token spelling.
pattern1 = [{'LOWER': 'solarpower'}]
# 'OP': '*' lets the punctuation token occur zero or more times, so this one
# pattern covers "solar power", "solar-power" and "solar--power" style spellings.
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]

In [31]:
matcher.add('SolarPower', None, pattern1, pattern2)

In [41]:
doc2 = nlp(u'Solar--power is solarpower, silly sentence')

In [42]:
found_matches = matcher(doc2)

In [43]:
found_matches

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]

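Phrase matching. At first glance it is not obvious why this is any better than a regex or simple string matching :-) The visible difference is that `PhraseMatcher` works on token sequences, so each match comes back as token start/end indices into the `Doc` (see the output below) rather than as character offsets.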

In [44]:
from spacy.matcher import PhraseMatcher

In [46]:
matcher = PhraseMatcher(nlp.vocab) 

In [47]:
# Read the whole Reaganomics text file and run it through the pipeline.
# NOTE(review): the path is relative to the notebook's working directory and no
# encoding is passed to open(), so the platform default is used - confirm that
# matches how the file was saved.
with open('../TextFiles/reaganomics.txt') as f:
    doc3 = nlp(f.read())

In [48]:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

In [49]:
# Build the PhraseMatcher patterns with the tokenizer only. The matcher compares
# token text, so running the tagger, parser and NER over every phrase (what
# calling nlp(text) does) is wasted work; make_doc yields the same tokens.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [50]:
matcher.add("EconMatcher", None, *phrase_patterns)

In [51]:
found_matches = matcher(doc3)

In [52]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2985, 2989)]

In [53]:
# Report each match as: hash id, rule name, start token, end token, matched text.
for match_id, start, end in found_matches:
    span = doc3[start:end]                      # tokens covered by this match
    string_id = nlp.vocab.strings[match_id]     # rule name looked up from its hash
    print(f"{match_id} {string_id} {start} {end} {span.text}")

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2985 2989 trickle-down economics
