Tokenization is the process of **breaking up original text into small units (tokens)**

Tokenization works by locating word boundaries. A word boundary is the point where one word ends and the next word begins.



# 1) Tokenization Basics

In [0]:
# Import spaCy and its built-in visualizer (displacy)
import spacy
from spacy import displacy

In [0]:
# Load spaCy's small English pipeline ('en_core_web_sm')
nlp = spacy.load('en_core_web_sm')

In [3]:
# Sample sentence that will be tokenized below
text = 'Apple is looking at buying U.K. startup for $1 billion !'
print(text)

Apple is looking at buying U.K. startup for $1 billion !


In [0]:
# Run the text through the pipeline to create a Doc object; its tokens are explored in the next cell
doc = nlp(text)

In [5]:
# Every single unit is a token here, including the dollar symbol, the number
# and the exclamation mark.
# spaCy isolates punctuation that does not form an integral part of a word
# (quotation marks, commas, ...) and assigns it its own token.
for tok in doc:
    print(tok.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion
!


In [6]:
# Punctuation that is part of an email address, website or numerical value
# is not split off: it stays inside the token.
doc_2 = nlp('Hello all, We are here to help you! email support@udemy.com or visit us at http://www.udemy.com!')
for tok in doc_2:
    print(tok.text)

Hello
all
,
We
are
here
to
help
you
!
email
support@udemy.com
or
visit
us
at
http://www.udemy.com
!


In [7]:
# Here the distance unit (km) and the dollar sign are each assigned their own token
doc_3 = nlp('10km cab ride almost costs $20 in NYC')
for tok in doc_3:
    print(tok.text)

10
km
cab
ride
almost
costs
$
20
in
NYC


In [8]:
# Contractions are split into separate tokens: "Let's" becomes "Let" and "'s".
# (Punctuation that is part of a known abbreviation, such as "U.K." in the first example, is kept as part of the token.)

doc_4 = nlp("Let's watch a movie together.")

for token in doc_4:
    print(token.text)

Let
's
watch
a
movie
together
.


In [9]:
# Counting tokens: len() of a Doc is its number of tokens (punctuation and "'s" included)
len(doc_4)

7

In [10]:
# Counting vocab entries
len(nlp.vocab)
# len(nlp.vocab) is the number of lexemes currently stored in the pipeline's vocab (511 in this run).
# NOTE: this is not a fixed vocabulary size of 'en_core_web_sm'; the count can change as more text is processed.

511

# 2) Indexing and slicing in tokens

In [11]:
# Example Doc used for the indexing and slicing cells below
doc5 = nlp("It's a beautiful day outside there!")
for tok in doc5:
    print(tok.text)

It
's
a
beautiful
day
outside
there
!


In [12]:
len(doc5) # number of tokens in doc5

8

In [13]:
doc5[0] # indexing: an integer index returns the token at that position (here the first token)

It

In [14]:
doc5[0:5] # slicing: returns the span made of the first five tokens (indices 0 to 4)

It's a beautiful day

In [15]:
doc5[-2] # negative indexing: counts from the end, so -2 is the second-to-last token

there

In [16]:
# Tokens can't be reassigned: assigning to an index of a Doc raises a TypeError.
# The expected error is caught and printed so that the demonstration stays
# visible while the notebook still survives "Restart Kernel -> Run All".
try:
    doc5[0] = 'New'
except TypeError as err:
    print('TypeError:', err)

TypeError: ignored

# 3) Named Entities

In [17]:
# Named entities: start from the tokens of a sentence that contains several of them
doc_6 = nlp('Apple is looking at buying U.K. startup for $1 billion')
for tok in doc_6:
    print(tok.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [18]:
# spaCy recognizes Apple, U.K. and $1 billion as named entities;
# these named entities give more information about the text
for entity in doc_6.ents:
    print(entity)

Apple
U.K.
$1 billion


In [19]:
# Each entity also has a label, and spacy.explain() gives details about that label
for entity in doc_6.ents:
    label = entity.label_
    print(entity)                  # the entity
    print(label)                   # the entity label
    print(spacy.explain(label))    # the entity label details
    print('\n')

Apple
ORG
Companies, agencies, institutions, etc.


U.K.
GPE
Countries, cities, states


$1 billion
MONEY
Monetary values, including unit




# 4) Noun Chunks

In [0]:
# Noun chunks are similar to doc.ents: both give spans of the document rather than single tokens
# In simple words, a noun chunk is a noun plus the words describing that particular noun

In [21]:
# Print every noun chunk found in the sentence
doc_7 = nlp("Autonomous cars shift insurance liability toward manufacturers.")
for noun_chunk in doc_7.noun_chunks:
    print(noun_chunk.text)

Autonomous cars
insurance liability
manufacturers


# 5) Built-in Visualizers

In [0]:
# spacy, displacy and the `nlp` pipeline come from the import and model-loading
# cells at the top of the notebook, so they are not imported or reloaded here.
# Parse the sentence that the visualizers below will render.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion !")

In [23]:
# Draw the dependency parse; the 'distance' option is the distance between tokens
displacy.render(
    docs=doc,
    style='dep',
    jupyter=True,
    options={'distance': 80},
)

In [24]:
# Visualize the named entities found by the entity recognizer
doc = nlp("Apple is looking at buying U.K. startup for $1 billion !")
displacy.render(docs=doc, jupyter=True, style='ent')