In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [10]:
doc1 = nlp(u'Tesla is looking forward to buying a startup for $6 million')

In [11]:
for token in doc1:
    print(token.text, "--", token.pos, "--", token.pos_, "--", token.dep_)

Tesla -- 96 -- PROPN -- nsubj
is -- 87 -- AUX -- aux
looking -- 100 -- VERB -- ROOT
forward -- 86 -- ADV -- advmod
to -- 85 -- ADP -- prep
buying -- 100 -- VERB -- pcomp
a -- 90 -- DET -- det
startup -- 92 -- NOUN -- dobj
for -- 85 -- ADP -- prep
$ -- 99 -- SYM -- quantmod
6 -- 93 -- NUM -- compound
million -- 93 -- NUM -- pobj


In [8]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x15f447fdfc0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x15f447fdba0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x15f445b5310>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x15f449b3d80>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x15f43b67f40>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x15f445b5380>)]

In [9]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [12]:
doc2 = nlp(u"Tesla isn't looking for startups anymore.")

In [13]:
for token in doc2:
    print(token.text, "--", token.pos, "--", token.pos_, "--", token.dep_)

Tesla -- 96 -- PROPN -- nsubj
is -- 87 -- AUX -- aux
n't -- 94 -- PART -- neg
looking -- 100 -- VERB -- ROOT
for -- 85 -- ADP -- prep
startups -- 92 -- NOUN -- pobj
anymore -- 86 -- ADV -- advmod
. -- 97 -- PUNCT -- punct


In [14]:
doc3 = nlp(u"This text is gonna contain spaces    lots of spaces      like really     !")

In [15]:
for token in doc3:
    print(token.text, "--", token.pos, "--", token.pos_, "--", token.dep_)

This -- 90 -- DET -- det
text -- 92 -- NOUN -- nsubj
is -- 87 -- AUX -- aux
gon -- 100 -- VERB -- ROOT
na -- 94 -- PART -- aux
contain -- 100 -- VERB -- xcomp
spaces -- 92 -- NOUN -- dobj
    -- 103 -- SPACE -- dep
lots -- 92 -- NOUN -- dobj
of -- 85 -- ADP -- prep
spaces -- 92 -- NOUN -- pobj
      -- 103 -- SPACE -- dep
like -- 85 -- ADP -- prep
really -- 86 -- ADV -- advmod
     -- 103 -- SPACE -- dep
! -- 97 -- PUNCT -- punct


In [16]:
doc2[0]

Tesla

In [17]:
doc2[0].pos_

'PROPN'

In [18]:
doc2[0].dep_

'nsubj'

In [19]:
doc2[0].tag_

'NNP'

# Spans

In [20]:
doc4 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [21]:
quote = doc4[16:30]

In [22]:
print(quote)

"Life is what happens to us while we are making other plans"


In [23]:
type(quote)

spacy.tokens.span.Span

In [24]:
type(doc4)

spacy.tokens.doc.Doc

In [25]:
doc5 = nlp(u"This is the first sentence. This is the second one. This is the last sentence.")

In [26]:
for sentence in doc5.sents:
    print(sentence)

This is the first sentence.
This is the second one.
This is the last sentence.


In [27]:
doc5[6]

This

In [28]:
doc5[6].is_sent_start

True

In [29]:
doc5[8].is_sent_start

False

# TOKENIZATION

Tokenization is the process of breaking up the original text into component pieces, a.k.a. tokens.

This is the first step in the spacy nlp pipeline

**Prefix:** Characters at the beginning ($, (, etc.)

**Suffix:** Characters at the end (!, ", etc.)

**Infix:** Characters in between 

**Exception:** Special case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied (let's, U.S, etc.)

In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [1]:
mystring = '"We\'re moving to L.A.!"'

In [2]:
mystring

'"We\'re moving to L.A.!"'

In [3]:
print(mystring)

"We're moving to L.A.!"


In [6]:
sampledoc = nlp(mystring)

In [7]:
for token in sampledoc:
    print(token)

"
We
're
moving
to
L.A.
!
"


In [9]:
sampletext = "We're here to help! Drop an e-mail at support@oursite.com or visit our website https://www.aravind.com for help!"

In [10]:
sampledoc = nlp(sampletext)

In [11]:
for token in sampledoc:
    print(token)

We
're
here
to
help
!
Drop
an
e
-
mail
at
support@oursite.com
or
visit
our
website
https://www.aravind.com
for
help
!


In [12]:
sampledoc = nlp("A 5 km ride cost $10.55")

In [13]:
for t in sampledoc:
    print(t)

A
5
km
ride
cost
$
10.55


In [14]:
len(sampledoc)

7

In [15]:
sampledoc.vocab

<spacy.vocab.Vocab at 0x261a53081f0>

In [16]:
len(sampledoc.vocab)

786

In [18]:
sampledoc = nlp("Apple to build a gigafactory worth $6 billion in Hong Kong")

In [21]:
for token in sampledoc:
    print(token.text, end = " | ")

Apple | to | build | a | gigafactory | worth | $ | 6 | billion | in | Hong | Kong | 

In [27]:
for entity in sampledoc.ents:
    print(entity, entity.label, entity.label_, spacy.explain(entity.label_),end = "\n")

Apple 383 ORG Companies, agencies, institutions, etc.
$6 billion 394 MONEY Monetary values, including unit
Hong Kong 384 GPE Countries, cities, states


Noun chunks!

In [28]:
sampledoc = nlp("Autonomous cars shift insurance liabilities towards car manufacturers")

In [29]:
for chunks in sampledoc.noun_chunks:
    print(chunks)

Autonomous cars
insurance liabilities
car manufacturers


Tokenization viz!

In [30]:
from spacy import displacy

In [31]:
doc = nlp("Apple is going to build a UK factory for $5 billion")

In [34]:
displacy.render(doc, style='dep', jupyter=True, options={'distance':70})

In [40]:
doc = nlp("During the last quarter, Apple sold nearly 20,000 units of iPods")

In [41]:
displacy.render(doc, style = 'ent', jupyter = True)

In [42]:
doc = nlp("This is a sentence.")

In [None]:
displacy.serve(doc, style = 'dep')

# Stemming

A crude method to catalog related words. Stemming does not work with languages like English that often consists of a lot of exceptions. Spacy does not include a stemmer.

In [45]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 6.4 MB/s eta 0:00:00
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2023.12.25-cp310-cp310-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 42.0/42.0 kB ? eta 0:00:00
Downloading regex-2023.12.25-cp310-cp310-win_amd64.whl (269 kB)
   ---------------------------------------- 269.5/269.5 kB 8.4 MB/s eta 0:00:00
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2023.12.25
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [46]:
import nltk
from nltk.stem.porter import PorterStemmer

In [47]:
pstemmer = PorterStemmer()

In [56]:
words = ["run", "runner", "ran", "running", "runs", "easily", "fairly", 'fairness']

In [57]:
for word in words:
    print(word + " -----> " + pstemmer.stem(word))

run -----> run
runner -----> runner
ran -----> ran
running -----> run
runs -----> run
easily -----> easili
fairly -----> fairli
fairness -----> fair


In [54]:
from nltk.stem.snowball import SnowballStemmer
snowstemmer = SnowballStemmer(language='english')

In [58]:
for word in words:
    print(word + " -----> " + snowstemmer.stem(word))

run -----> run
runner -----> runner
ran -----> ran
running -----> run
runs -----> run
easily -----> easili
fairly -----> fair
fairness -----> fair


In [61]:
anotherwords = ["generous", "generation", "generously", "generate"]

In [62]:
for word in anotherwords:
    print(word + " -----> " + snowstemmer.stem(word))

generous -----> generous
generation -----> generat
generously -----> generous
generate -----> generat


# Lemmatization

Lemma is the base form of the word. Instead of cutting words off, like in stemming, lemmatization overcomes the exceptions in languages. It uses vocabulary of languages and reduces the words to their base form.

In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
doc1 = nlp(u"I am a runner running a race because I love to run since I was a little kid")

In [6]:
for token in doc1:
    print(token.text, "\t", token.pos_, "\t",  token.lemma_)

I 	 PRON 	 I
am 	 AUX 	 be
a 	 DET 	 a
runner 	 NOUN 	 runner
running 	 VERB 	 run
a 	 DET 	 a
race 	 NOUN 	 race
because 	 SCONJ 	 because
I 	 PRON 	 I
love 	 VERB 	 love
to 	 PART 	 to
run 	 VERB 	 run
since 	 SCONJ 	 since
I 	 PRON 	 I
was 	 AUX 	 be
a 	 DET 	 a
little 	 ADJ 	 little
kid 	 NOUN 	 kid


In [12]:
def lemmatize(text):
    for token in text:
        print(token.text, "   |   ", token.pos_, "   |   ",  token.lemma_)

In [8]:
doc2 = nlp(u"I saw multiple mice today. It was a horrific experience!")

In [13]:
lemmatize(doc2)

I    |    PRON    |    I
saw    |    VERB    |    see
multiple    |    ADJ    |    multiple
mice    |    NOUN    |    mouse
today    |    NOUN    |    today
.    |    PUNCT    |    .
It    |    PRON    |    it
was    |    AUX    |    be
a    |    DET    |    a
horrific    |    ADJ    |    horrific
experience    |    NOUN    |    experience
!    |    PUNCT    |    !


# Stop words

We do not want stop words like "A" and "The" and SpaCy has about 326 stop words that gets fitered out.

In [14]:
import spacy

In [15]:
nlp = spacy.load("en_core_web_sm")

In [16]:
print(nlp.Defaults.stop_words)

{'move', 'yourselves', 'out', 'towards', 'take', 'afterwards', 'elsewhere', 'enough', '’d', 'twelve', 'already', 'whereas', 'across', 'thereby', 'another', 'she', 'became', 'everywhere', 'her', 'therein', 'not', 'ca', 'that', 'moreover', 'too', 'wherever', 'three', "n't", 'on', '‘s', 'ten', 'those', 'seems', 'few', 'might', "'ll", 'into', 'using', 'am', 'whenever', 'over', 'with', 'than', 'sometime', 'whereby', 'now', 'fifteen', 'behind', 'ever', 'forty', 'within', 'sixty', 'itself', 'there', 'while', '’re', 'done', 'did', 'otherwise', 'everyone', 'i', 'either', 'where', 'why', 'upon', 'up', '‘d', 'when', 'yourself', 'these', 'back', 'in', 'less', 'must', 'yours', 'also', '’ll', 'a', 'throughout', 'most', 'them', 'it', 'some', 'without', 'was', 'n‘t', 'whence', 'hereafter', 'every', 'only', 'beside', 'nevertheless', 'my', 'besides', 'except', '’ve', '‘m', 'least', 'ours', 'around', 'hereupon', "'m", 'meanwhile', 'same', 'seeming', 'many', 'more', 'should', 'whole', 'yet', 'under', 'at'

Note that the above output is a set and not a dictionary

In [17]:
len(nlp.Defaults.stop_words)

326

In [18]:
nlp.vocab["bright"].is_stop

False

In [19]:
nlp.vocab["okay"].is_stop

False

In [20]:
nlp.vocab["and"].is_stop

True

We can also add stop words to the set

In [21]:
nlp.Defaults.stop_words.add("BTW")

In [22]:
len(nlp.Defaults.stop_words)

327

We can remove stop words from the set too

In [23]:
nlp.Defaults.stop_words.remove("beyond")

In [31]:
nlp.vocab["beyond"].is_stop = False

In [32]:
nlp.vocab["beyond"].is_stop

False

Lets add the word back in

In [26]:
nlp.vocab["beyond"].is_stop = True

In [28]:
nlp.vocab["beyond"].is_stop

True

In [33]:
len(nlp.Defaults.stop_words)

326

In [None]:
nlp.Defaults.stop_words

# Phrase matching and vocabulary

## Rule-based Matching
spaCy offers a rule-matching tool called `Matcher` that allows you to build a library of token patterns, then match those patterns against a Doc object to return a list of found matches. You can match on any part of the token including text and annotations, and you can add multiple patterns to the same matcher.

In [34]:
import spacy

In [35]:
nlp = spacy.load("en_core_web_sm")

In [36]:
from spacy.matcher import Matcher

In [37]:
matcher = Matcher(nlp.vocab)

Pattern is a list of dictionaries

In [93]:
# creating patterns for Solarpower, Solar-power, and Solar Power
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {"IS_PUNCT": True}, {"LOWER": "power"}]
pattern3 = [{"LOWER": "solar"}, {"LOWER": "power"}]

In [104]:
matcher.add("Solarpower",patterns = [pattern1,pattern2,pattern3])

In [88]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [105]:
found_matches = matcher(doc)

In [106]:
print(found_matches)

[(6544436658971563323, 1, 3), (6544436658971563323, 10, 11), (6544436658971563323, 13, 16)]


In [107]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(string_id, start, end, span.text)

Solarpower 1 3 Solar Power
Solarpower 10 11 solarpower
Solarpower 13 16 Solar-power


In [102]:
matcher.remove("SolarPower")

Phrase matcher...better than the matcher

In [111]:
from spacy.matcher import PhraseMatcher

In [112]:
matcher = PhraseMatcher(nlp.vocab)

In [113]:
with open("reaganomics.txt") as f:
    doc3 = nlp(f.read())

In [114]:
phrases = ['vodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

In this next step, we are converting each phrase to a document object. We are using a list comprehension to split this one doc to multiple doc objects

In [115]:
phrase_patterns = [nlp(text) for text in phrases] 

In [116]:
phrase_patterns

[vodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [118]:
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [120]:
matcher.add('EconMatcher',phrase_patterns)

In [121]:
found_matches = matcher(doc3)

In [122]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2987, 2991)]

In [124]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc3[start:end]
    print(string_id, start, end, span.text)

EconMatcher 41 45 supply-side economics
EconMatcher 49 53 trickle-down economics
EconMatcher 61 65 free-market economics
EconMatcher 673 677 supply-side economics
EconMatcher 2987 2991 trickle-down economics
