In [1]:
import spacy

In [2]:
# Load spaCy's small English pipeline; calling `nlp` on a string returns a processed Doc.
nlp = spacy.load('en_core_web_sm')

In [10]:
# Run the pipeline over a sample sentence to obtain a Doc.
doc1 = nlp("Tesla is looking forward to buying a startup for $6 million")

In [11]:
# One row per token: text, numeric POS id, POS label, dependency label.
for tok in doc1:
    print(tok.text, tok.pos, tok.pos_, tok.dep_, sep=" -- ")

Tesla -- 96 -- PROPN -- nsubj
is -- 87 -- AUX -- aux
looking -- 100 -- VERB -- ROOT
forward -- 86 -- ADV -- advmod
to -- 85 -- ADP -- prep
buying -- 100 -- VERB -- pcomp
a -- 90 -- DET -- det
startup -- 92 -- NOUN -- dobj
for -- 85 -- ADP -- prep
$ -- 99 -- SYM -- quantmod
6 -- 93 -- NUM -- compound
million -- 93 -- NUM -- pobj


In [8]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x15f447fdfc0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x15f447fdba0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x15f445b5310>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x15f449b3d80>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x15f43b67f40>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x15f445b5380>)]

In [9]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [12]:
# Sample sentence containing a contraction ("isn't").
doc2 = nlp("Tesla isn't looking for startups anymore.")

In [13]:
# One row per token: text, numeric POS id, POS label, dependency label.
for tok in doc2:
    print(tok.text, tok.pos, tok.pos_, tok.dep_, sep=" -- ")

Tesla -- 96 -- PROPN -- nsubj
is -- 87 -- AUX -- aux
n't -- 94 -- PART -- neg
looking -- 100 -- VERB -- ROOT
for -- 85 -- ADP -- prep
startups -- 92 -- NOUN -- pobj
anymore -- 86 -- ADV -- advmod
. -- 97 -- PUNCT -- punct


In [14]:
# Sample sentence with runs of consecutive spaces between words.
doc3 = nlp("This text is gonna contain spaces    lots of spaces      like really     !")

In [15]:
# One row per token: text, numeric POS id, POS label, dependency label.
for tok in doc3:
    print(tok.text, tok.pos, tok.pos_, tok.dep_, sep=" -- ")

This -- 90 -- DET -- det
text -- 92 -- NOUN -- nsubj
is -- 87 -- AUX -- aux
gon -- 100 -- VERB -- ROOT
na -- 94 -- PART -- aux
contain -- 100 -- VERB -- xcomp
spaces -- 92 -- NOUN -- dobj
    -- 103 -- SPACE -- dep
lots -- 92 -- NOUN -- dobj
of -- 85 -- ADP -- prep
spaces -- 92 -- NOUN -- pobj
      -- 103 -- SPACE -- dep
like -- 85 -- ADP -- prep
really -- 86 -- ADV -- advmod
     -- 103 -- SPACE -- dep
! -- 97 -- PUNCT -- punct


In [16]:
doc2[0]

Tesla

In [17]:
doc2[0].pos_

'PROPN'

In [18]:
doc2[0].dep_

'nsubj'

In [19]:
doc2[0].tag_

'NNP'

# Spans

In [20]:
# Longer sample text used to demonstrate Span slicing.
# Built from adjacent string literals (joined into one string) rather than
# backslash continuations inside a single literal, which silently break if
# trailing whitespace or indentation is introduced.
# The misspelling "commmonly" is corrected to "commonly": one word for one
# word, so the token positions used by later slices of doc4 are unaffected.
doc4 = nlp(
    'Although commonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    "cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [21]:
# Slice tokens 16 through 29 (the end index is exclusive): the quoted sentence
# together with its opening and closing quotation marks.
quote = doc4[16:30]

In [22]:
print(quote)

"Life is what happens to us while we are making other plans"


In [23]:
type(quote)

spacy.tokens.span.Span

In [24]:
type(doc4)

spacy.tokens.doc.Doc

In [25]:
# Three short sentences, used to demonstrate sentence segmentation.
doc5 = nlp(
    "This is the first sentence. "
    "This is the second one. "
    "This is the last sentence."
)

In [26]:
# Iterate over the sentences found in doc5, printing one per line.
for sent in doc5.sents:
    print(sent)

This is the first sentence.
This is the second one.
This is the last sentence.


In [27]:
doc5[6]

This

In [28]:
doc5[6].is_sent_start

True

In [29]:
doc5[8].is_sent_start

False

# TOKENIZATION

Tokenization is the process of breaking up the original text into component pieces, a.k.a. tokens.

This is the first step in the spaCy NLP pipeline.

**Prefix:** Characters at the beginning ($, (, etc.)

**Suffix:** Characters at the end (!, ", etc.)

**Infix:** Characters in between (-, --, /, etc.)

**Exception:** Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied (let's, U.S., etc.)

In [5]:
import spacy
# Load the small English pipeline; calling `nlp` on a string tokenizes it into a Doc.
nlp = spacy.load("en_core_web_sm")

In [1]:
# Sample string wrapped in literal double quotes and containing an apostrophe.
mystring = "\"We're moving to L.A.!\""

In [2]:
mystring

'"We\'re moving to L.A.!"'

In [3]:
print(mystring)

"We're moving to L.A.!"


In [6]:
sampledoc = nlp(mystring)

In [7]:
# Print each token of the parsed string on its own line.
for tok in sampledoc:
    print(tok)

"
We
're
moving
to
L.A.
!
"


In [9]:
# Sample text mixing a hyphenated word, an e-mail address and a URL.
# Adjacent literals are joined into one string.
sampletext = (
    "We're here to help! "
    "Drop an e-mail at support@oursite.com "
    "or visit our website https://www.aravind.com "
    "for help!"
)

In [10]:
sampledoc = nlp(sampletext)

In [11]:
# Print each token of the parsed text on its own line.
for tok in sampledoc:
    print(tok)

We
're
here
to
help
!
Drop
an
e
-
mail
at
support@oursite.com
or
visit
our
website
https://www.aravind.com
for
help
!


In [12]:
sampledoc = nlp("A 5 km ride cost $10.55")

In [13]:
# Print each token on its own line.
for token in sampledoc:
    print(token)

A
5
km
ride
cost
$
10.55


In [14]:
len(sampledoc)

7

In [15]:
sampledoc.vocab

<spacy.vocab.Vocab at 0x261a53081f0>

In [16]:
len(sampledoc.vocab)

786

In [18]:
sampledoc = nlp("Apple to build a gigafactory worth $6 billion in Hong Kong")

In [21]:
# Print all token texts on a single line, each followed by " | ".
for tok in sampledoc:
    print(tok.text, end=" | ")

Apple | to | build | a | gigafactory | worth | $ | 6 | billion | in | Hong | Kong | 

In [27]:
# For each named entity: its text, numeric label id, label string,
# and spaCy's human-readable explanation of that label.
for ent in sampledoc.ents:
    print(ent, ent.label, ent.label_, spacy.explain(ent.label_))

Apple 383 ORG Companies, agencies, institutions, etc.
$6 billion 394 MONEY Monetary values, including unit
Hong Kong 384 GPE Countries, cities, states


Noun chunks!

In [28]:
sampledoc = nlp("Autonomous cars shift insurance liabilities towards car manufacturers")

In [29]:
# Print each noun chunk of the sentence on its own line.
for chunk in sampledoc.noun_chunks:
    print(chunk)

Autonomous cars
insurance liabilities
car manufacturers


Visualization with displaCy!

In [30]:
from spacy import displacy

In [31]:
doc = nlp("Apple is going to build a UK factory for $5 billion")

In [34]:
# Render the dependency parse inline in the notebook.
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={"distance": 70},
)

In [40]:
doc = nlp("During the last quarter, Apple sold nearly 20,000 units of iPods")

In [41]:
# Render the named entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [42]:
doc = nlp("This is a sentence.")

In [None]:
# NOTE(review): displacy.serve starts spaCy's built-in web server and keeps this
# cell running until the kernel is interrupted, so a top-to-bottom run will
# appear to stop here (this cell has no execution count). Inside Jupyter,
# displacy.render(..., jupyter=True), as used in the cells above, shows the
# visualization inline without blocking.
displacy.serve(doc, style = 'dep')

# Stemming

Stemming is a crude method for cataloguing related words by chopping off word endings. It does not work well for languages like English, which have many exceptions to their spelling rules. spaCy does not include a stemmer, so the examples below use NLTK instead.

In [45]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 6.4 MB/s eta 0:00:00
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2023.12.25-cp310-cp310-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 42.0/42.0 kB ? eta 0:00:00
Downloading regex-2023.12.25-cp310-cp310-win_amd64.whl (269 kB)
   ---------------------------------------- 269.5/269.5 kB 8.4 MB/s eta 0:00:00
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2023.12.25
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [46]:
import nltk
from nltk.stem.porter import PorterStemmer

In [47]:
pstemmer = PorterStemmer()

In [56]:
# Word forms used to compare the stemmers: inflections of "run"
# plus a few -ly / -ness derivations.
words = [
    "run",
    "runner",
    "ran",
    "running",
    "runs",
    "easily",
    "fairly",
    "fairness",
]

In [57]:
# Show each word next to its Porter stem.
for w in words:
    print(f"{w} -----> {pstemmer.stem(w)}")

run -----> run
runner -----> runner
ran -----> ran
running -----> run
runs -----> run
easily -----> easili
fairly -----> fairli
fairness -----> fair


In [54]:
from nltk.stem.snowball import SnowballStemmer
# Snowball stemmer configured for English; it is run on the same `words` list
# below so its output can be compared with the Porter stemmer's.
snowstemmer = SnowballStemmer(language='english')

In [58]:
# Show each word next to its Snowball stem.
for w in words:
    print(f"{w} -----> {snowstemmer.stem(w)}")

run -----> run
runner -----> runner
ran -----> ran
running -----> run
runs -----> run
easily -----> easili
fairly -----> fair
fairness -----> fair


In [61]:
# Second word list: "generous"/"generously" versus "generation"/"generate".
anotherwords = [
    "generous",
    "generation",
    "generously",
    "generate",
]

In [62]:
# Show each word next to its Snowball stem.
for w in anotherwords:
    print(f"{w} -----> {snowstemmer.stem(w)}")

generous -----> generous
generation -----> generat
generously -----> generous
generate -----> generat
