# NLP with spaCy - Tokenization, Named Entities, and Noun Chunks

In [1]:
import spacy

In [2]:
# Load the spaCy pipeline once; the cells below reuse `nlp` for every sample.
# The `en_core_web_sm` model package must already be installed in this environment.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Sample sentence with a contraction ("You're") and a dotted abbreviation ("U.K.").
# No backslash is needed for the apostrophe inside a double-quoted literal.
mystring = "You're in a hurry for work in U.K.!"
print(mystring)

You're in a hurry for work in U.K.!


In [4]:
# Tokenize the first sample and print each token's text on its own line.
doc = nlp(mystring)
print("".join(f"{token.text}\n" for token in doc), end="")

You
're
in
a
hurry
for
work
in
U.K.
!


In [5]:
# Second sample: contains a hyphenated word, an ordinal ("1st"), and a URL
# followed directly by punctuation.
mystring2 = (
    "Let's have a user-based experience on the 1st demo "
    "of our site https://www.rhsp.com!"
)
print(mystring2)

Let's have a user-based experience on the 1st demo of our site https://www.rhsp.com!


In [6]:
# Tokenize the second sample and show all tokens on a single line. Every token,
# including the last one, is followed by " | " and no trailing newline is written.
doc = nlp(mystring2)
print("".join(f"{token.text} | " for token in doc), end="")

Let | 's | have | a | user | - | based | experience | on | the | 1st | demo | of | our | site | https://www.rhsp.com | ! | 

In [7]:
# Third sample: a sentence with several proper names, used below for
# named-entity and noun-chunk extraction.
mystring3 = (
    "Achilles confronts Paris about what Hector died for, "
    "Troy or honor?"
)
print(mystring3)

Achilles confronts Paris about what Hector died for, Troy or honor?


In [8]:
# Run the pipeline on the third sample; `doc1` is reused by the noun-chunk cell.
doc1 = nlp(mystring3)
# List each named entity next to the description spacy.explain gives for its label.
for entity in doc1.ents:
    print(f"{entity} \t= {spacy.explain(entity.label_)}")

Paris 	= Countries, cities, states
Hector 	= People, including fictional


In [9]:
# NOTE(review): "stratups" looks like a typo for "startups"; it is left as-is
# because the recorded outputs below were produced from this exact text.
doc2 = nlp("Google is not investing $300 million dollar for Taiwan-based stratups")

# One row per entity: text padded to 20 columns, raw label, label description.
for entity in doc2.ents:
    print(entity.text.ljust(20), entity.label_, spacy.explain(entity.label_), sep=" = ")

Google               = ORG = Companies, agencies, institutions, etc.
$300 million dollar  = MONEY = Monetary values, including unit
Taiwan               = GPE = Countries, cities, states


In [10]:
# noun chunks
for chunk in doc1.noun_chunks:
    print(chunk)

Achilles confronts
what
Hector
Troy
honor


In [11]:
# Print every noun chunk found in the Google/Taiwan sentence (doc2), one per line.
for chunk in doc2.noun_chunks:
    print(chunk.text)

Google
$300 million dollar
Taiwan-based stratups
