In [1]:
import spacy
from spacy import displacy

TOKENIZATION is the process of breaking up the original text into its component pieces, called tokens.

It is one of the first steps in any NLP pipeline.

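The tokenizer mainly splits strings based on prefixes, suffixes, infixes and exceptions. For example, an opening quote is split off as a prefix, a trailing "!" as a suffix, and a contraction such as "We're" is split into "We" and "'re" by an exception rule.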

In [4]:
# Name of the installed spaCy pipeline package to load; the resulting `nlp`
# object is what turns raw strings into Doc objects in the cells below.
MODEL_NAME = 'en_core_web_sm'
nlp = spacy.load(MODEL_NAME)

In [3]:
# Sample sentence for the tokenizer. The double quotes are part of the text
# itself; a triple-quoted literal avoids having to escape the apostrophe.
my_str = '''"We're moving to L.A!"'''
print(my_str)

"We're moving to L.A!"


In [5]:
# Run the loaded pipeline on the raw string; the result is the Doc whose
# tokens are inspected in the following cells.
doc = nlp(my_str)

In [7]:
# .text gives back the string the Doc was built from (compare with my_str above).
print(doc.text)

"We're moving to L.A!"


In [8]:
# Iterating over a Doc yields its tokens in order; collect each token's text
# first, then print one per line.
token_texts = [tok.text for tok in doc]
for text in token_texts:
    print(text)

"
We
're
moving
to
L.A
!
"


In [9]:
# The length of a Doc is its number of tokens (the pieces printed in the loop above).
len(doc)

8

In [10]:
# Number of entries (lexemes) currently held in the vocabulary shared by the
# pipeline and this Doc.
# NOTE(review): per the spaCy Vocab docs this count can grow as more text is
# processed, so it is not a fixed property of en_core_web_sm; confirm for the
# installed version.
len(doc.vocab)

768

In [14]:
# A Doc can be indexed and sliced like a list: an integer index picks out a
# single token, a slice picks out a run of consecutive tokens.
sixth_token = doc[5]
words_one_to_four = doc[1:5]

print(sixth_token)
print(words_one_to_four)

L.A
We're moving to


In [19]:
# The named entities the pipeline finds are exposed on the Doc through its
# `ents` attribute; build a second Doc to demonstrate this.
# (Python 3 strings are already unicode, so no `u` prefix is needed.)

doc2 = nlp("Apple to build a HongKong factory for $6 million")
print(doc2.text)

Apple to build a HongKong factory for $6 million


In [21]:
# Show all tokens of doc2 on a single line, each one followed by ' | '.
joined_tokens = ''.join(tok.text + ' | ' for tok in doc2)
print(joined_tokens, end='')

Apple | to | build | a | HongKong | factory | for | $ | 6 | million | 

In [27]:
# For every named entity: its text, its label, and a human-readable
# description of that label, followed by blank lines as a separator.
for ent in doc2.ents:
    label = ent.label_
    # str() keeps the print well-defined even if explain() has no entry for the label
    description = str(spacy.explain(label))

    print(ent)
    print(label)
    print(description)
    print("\n")

Apple
ORG
Companies, agencies, institutions, etc.


HongKong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [28]:
# Noun chunks are, roughly speaking, the noun phrases of the sentence.
# Materialise them once, then print one phrase per line.

noun_phrases = list(doc2.noun_chunks)
for phrase in noun_phrases:
    print(phrase)

Apple
a HongKong factory


### VISUALIZATION OF TOKENS

displaCy can render its visualizations directly inside a Jupyter notebook; outside of a notebook it can instead serve them from a small web server.

In [32]:
# Sentence used for the displaCy visualisations in the cells below.
# (Python 3 strings are already unicode, so no `u` prefix is needed.)
doc3 = nlp("Apple is going to build a UK factory for $6 million")
print(doc3.text)

Apple is going to build a UK factory for $6 million


In [33]:
# Walk doc3 by position and print the text of the token at each index.
for position in range(len(doc3)):
    print(doc3[position].text)

Apple
is
going
to
build
a
UK
factory
for
$
6
million


In [46]:
# style='dep' draws the syntactic dependency parse of the sentence.
# Visual settings for it:
#   distance -> spacing between neighbouring tokens
#   bg       -> background colour
#   color    -> text colour
dep_options = {'distance': 110, 'bg': "black", 'color': "yellow"}

displacy.render(doc3, style='dep', jupyter=True, options=dep_options)

In [38]:
# style='ent' switches displaCy to its named-entity view of the same Doc;
# jupyter=True asks it to render inline in the notebook.
displacy.render(doc3, style='ent', jupyter=True)
