# Tokenization in Spacy: NLP Tutorial For Beginners - 8

https://www.youtube.com/watch?v=_lR3RjvYvF4&list=PLeo1K3hjS3uuvuAXhYjV2lMEShq2UYSwX&index=8

## word tokenization

In [1]:
import spacy

In [2]:
# Create a blank English pipeline: it tokenizes text but has no other components yet.
nlp = spacy.blank("en")

# Running the pipeline on a string produces a Doc holding the tokens.
doc = nlp("Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate.")

# Display the Doc; it renders as the original text.
doc

Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate.

In [3]:
# The object returned by calling the pipeline on text is a Doc.
type(doc)

spacy.tokens.doc.Doc

In [4]:
# Integer indexing into a Doc gives a single token.
doc[0]

Dr.

In [5]:
# A single element of a Doc is a Token object.
type(doc[0])

spacy.tokens.token.Token

In [6]:
# Slicing a Doc gives the tokens at positions 0 and 1 as one object.
doc[0:2]

Dr. Strange

In [7]:
# A slice of a Doc is a Span object, not a list of tokens.
type(doc[0:2])

spacy.tokens.span.Span

In [8]:
# The pipeline created by spacy.blank("en") is an English language object.
type(nlp)

spacy.lang.en.English

In [9]:
# Iterating a Doc yields its tokens in order; note that "Dr." stays one token while "2$" is split.
list(doc)

[Dr.,
 Strange,
 loves,
 pav,
 bhaji,
 of,
 mumbai,
 as,
 it,
 costs,
 only,
 2,
 $,
 per,
 plate,
 .]

### attributes and methods of a spacy.tokens.token.Token object

In [10]:
# List every attribute and method name available on a Token (the output is long).
dir(doc[0])

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [11]:
# Show the first token again before checking its boolean flags.
doc[0]

Dr.

In [12]:
# is_currency is a boolean flag on the token; it is False for "Dr.".
doc[0].is_currency

False

In [13]:
# is_digit is a boolean flag on the token; it is False for "Dr.".
doc[0].is_digit

False

#### let's keep only digit tokens

In [14]:
# Keep only the tokens whose is_digit flag is set.
[token for token in doc if token.is_digit]

[2]

#### let's keep only tokens that consist entirely of alphabetic characters

In [15]:
# Keep only the tokens whose is_alpha flag is set ("Dr." is dropped because of its period).
[token for token in doc if token.is_alpha]

[Strange, loves, pav, bhaji, of, mumbai, as, it, costs, only, per, plate]

#### finding numerical values (both digits and written)

In [16]:
# New sample text mixing spelled-out numbers, digits, and currency symbols.
doc = nlp("Tony has two (2) dollars ($) and one euro (€)")

In [17]:
# like_num matches both digits ("2") and spelled-out numbers ("two", "one").
[token.text for token in doc if token.like_num]

['two', '2', 'one']

#### finding currency symbols

In [18]:
# Collect the text of every token flagged as a currency symbol.
[token.text for token in doc if token.is_currency]

['$', '€']

#### checking tokens for different types

In [19]:
# Boolean token attributes to report, in the order they are printed.
flag_names = ("is_alpha", "is_punct", "like_num", "is_currency")

for token in doc:
    # token.i is the position of the token inside the doc.
    report = ["index: ", token.i]
    for flag_name in flag_names:
        report.extend((f"{flag_name}:", getattr(token, flag_name)))
    print(token, "==>", *report)

Tony ==> index:  0 is_alpha: True is_punct: False like_num: False is_currency: False
has ==> index:  1 is_alpha: True is_punct: False like_num: False is_currency: False
two ==> index:  2 is_alpha: True is_punct: False like_num: True is_currency: False
( ==> index:  3 is_alpha: False is_punct: True like_num: False is_currency: False
2 ==> index:  4 is_alpha: False is_punct: False like_num: True is_currency: False
) ==> index:  5 is_alpha: False is_punct: True like_num: False is_currency: False
dollars ==> index:  6 is_alpha: True is_punct: False like_num: False is_currency: False
( ==> index:  7 is_alpha: False is_punct: True like_num: False is_currency: False
$ ==> index:  8 is_alpha: False is_punct: False like_num: False is_currency: True
) ==> index:  9 is_alpha: False is_punct: True like_num: False is_currency: False
and ==> index:  10 is_alpha: True is_punct: False like_num: False is_currency: False
one ==> index:  11 is_alpha: True is_punct: False like_num: True is_currency: False

#### finding emails

In [20]:
# Keep the raw string under its own name so that `doc` always refers to a spaCy Doc
# (previously `doc` was first a str and then rebound to a Doc).
text = "Our emails are: slavoj.zizek@yahoo.com and borat@mail.kz!!!"

doc = nlp(text)

# Display the Doc; it renders as the original text.
doc

Our emails are: slavoj.zizek@yahoo.com and borat@mail.kz!!!

In [21]:
# token.text gives the plain string rather than the Token object (spacy.tokens.token.Token).
[token.text for token in doc if token.like_email]

['slavoj.zizek@yahoo.com', 'borat@mail.kz']

## sentence tokenization

In [22]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. It costs only 2$ per plate.")

# The blank pipeline has no component that sets sentence boundaries, so iterating
# doc.sents raises ValueError [E030]. Catch and print it so the notebook still
# runs top to bottom while showing the error this cell is meant to demonstrate.
try:
    list(doc.sents)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [23]:
# List the pipeline's components; a blank pipeline has none.
nlp.pipe_names

[]

In [24]:
# Add the sentencizer component, as suggested by the E030 error message, so sentence boundaries get set.
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x1a0038e3440>

In [25]:
# Confirm that the sentencizer is now part of the pipeline.
nlp.pipe_names

['sentencizer']

In [26]:
# Re-run the pipeline: a Doc only gets sentence boundaries if the sentencizer ran when it was created.
doc = nlp("Dr. Strange loves pav bhaji of mumbai. It costs only 2$ per plate.")

# doc.sents yields one Span per sentence.
list(doc.sents)

[Dr. Strange loves pav bhaji of mumbai., It costs only 2$ per plate.]

## finding French numerical values (both digits and written)

In [27]:
# NOTE: this rebinds `nlp` to a blank French pipeline; the English pipeline
# (with its sentencizer) is no longer reachable through this name.
nlp = spacy.blank("fr")

# French sample: spelled-out numbers, a digit group "(64)", and a non-number token "xxx".
doc = nlp("un, deux, trois, quatre, cinq, six, sept, huit, neuf, dix, onze, douze, soixante quatre (64) xxx")

# Display the Doc; it renders as the original text.
doc

un, deux, trois, quatre, cinq, six, sept, huit, neuf, dix, onze, douze, soixante quatre (64) xxx

In [28]:
# like_num also recognises French number words; "xxx" and the punctuation are filtered out.
[token.text for token in doc if token.like_num]

['un',
 'deux',
 'trois',
 'quatre',
 'cinq',
 'six',
 'sept',
 'huit',
 'neuf',
 'dix',
 'onze',
 'douze',
 'soixante',
 'quatre',
 '64']