<h2 align="center">spaCy Tokenization Tutorial</h2>

In [1]:
import spacy

Create a blank language object and tokenize the words in a sentence

In [2]:
# A blank English language object: a tokenizer plus an empty pipeline.
nlp = spacy.blank("en")

doc = nlp("Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate.")

# Iterating over the Doc yields its tokens in order; show one per line.
for token in doc:
    print(token.text)

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


Creating a blank language object gives you a tokenizer and an empty pipeline. We will look more closely at language pipelines in the next tutorial.

<h3>Using index to grab tokens</h3>

In [3]:
# Index into the Doc to grab a single token (here the first one).
doc[0]

Dr.

In [4]:
# Keep the second token and read its content as a plain string via .text.
token = doc[1]
token.text

'Strange'

<h3>Span object</h3>

In [10]:
# Slicing the Doc gives a span covering the first five tokens.
span = doc[0:5]
span

Dr. Strange loves pav bhaji

<h3>Token attributes</h3>

In [12]:
# Rebinds `doc` to a shorter sentence used for the attribute examples that follow.
doc = nlp("Tony gave two $ to Peter.")

In [13]:
# First token of the new sentence.
token0 = doc[0]
token0

Tony

In [14]:
# List the attributes and methods available on a token object.
dir(token0)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [15]:
# "Tony" is a purely alphabetic token.
token0.is_alpha

True

In [16]:
# "Tony" does not look like a number.
token0.like_num

False

In [17]:
# Third token of the sentence: the word "two".
token2 = doc[2]
token2

two

In [18]:
# like_num also recognizes a spelled-out number such as "two".
token2.like_num

True

In [36]:
# Fourth token of the sentence: the "$" sign.
token3 = doc[3]
token3

$

In [37]:
# The "$" sign on its own is not number-like.
token3.like_num

False

In [38]:
# The "$" sign is flagged as a currency symbol.
token3.is_currency

True

In [12]:
# Re-tokenize the first example sentence here: by this point `doc` has been
# rebound to "Tony gave two $ to Peter.", so without this line a top-to-bottom
# run would not reproduce the per-token table shown in this cell's output.
doc = nlp("Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate.")

# One line of flags per token, each followed by a blank line for readability.
for token in doc:
    print(token, "==>",
          "index: ", token.i,
          "is_alpha:", token.is_alpha,
          "is_punct:", token.is_punct,
          "like_num:", token.like_num,
          "is_currency:", token.is_currency,
          end="\n\n")

Dr. ==> index:  0 is_alpha: False is_punct: False like_num: False is_currency: False

Strange ==> index:  1 is_alpha: True is_punct: False like_num: False is_currency: False

loves ==> index:  2 is_alpha: True is_punct: False like_num: False is_currency: False

pav ==> index:  3 is_alpha: True is_punct: False like_num: False is_currency: False

bhaji ==> index:  4 is_alpha: True is_punct: False like_num: False is_currency: False

of ==> index:  5 is_alpha: True is_punct: False like_num: False is_currency: False

mumbai ==> index:  6 is_alpha: True is_punct: False like_num: False is_currency: False

as ==> index:  7 is_alpha: True is_punct: False like_num: False is_currency: False

it ==> index:  8 is_alpha: True is_punct: False like_num: False is_currency: False

costs ==> index:  9 is_alpha: True is_punct: False like_num: False is_currency: False

only ==> index:  10 is_alpha: True is_punct: False like_num: False is_currency: False

2 ==> index:  11 is_alpha: False is_punct: False lik

<h3>Collecting students' email IDs from a student information sheet</h3>

In [17]:
# Load the sheet as a list of raw lines; each line keeps its trailing newline.
with open("students.txt") as f:
    text = list(f)
text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [18]:
# Collapse the list of lines into a single string for the tokenizer.
# Guarded so the cell can be re-run safely: once `text` is already a string,
# " ".join(text) would insert a space between every character.
if not isinstance(text, str):
    text = " ".join(text)
text



In [20]:
# Tokenize the whole sheet and keep the text of every token that looks like an email.
doc = nlp(text)
emails = [token.text for token in doc if token.like_email]
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

<h3>Support in other languages</h3>

spaCy supports many language models. Some of them do not support pipelines, though!
https://spacy.io/usage/models#languages

In [21]:
# Blank Hindi language object; note that this rebinds `nlp`.
nlp = spacy.blank("hi")
doc = nlp("भैया जी! 5000 ₹ उधार थे वो वापस देदो")

# Show each token with its currency and number-like flags.
for token in doc:
    print(token.text, token.is_currency, token.like_num)

भैया False False
जी False False
! False False
5000 False True
₹ True False
उधार False False
थे False False
वो False False
वापस False False
देदो False False


<h3>Customizing tokenizer</h3>

In [23]:
from spacy.symbols import ORTH  # used by the tokenizer special case in the next cell

# With the default English rules, "gimme" stays a single token.
nlp = spacy.blank("en")
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [tok.text for tok in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [29]:
# Register a special case so the tokenizer splits "gimme" into "gim" + "me",
# then tokenize the same sentence again to see the effect.
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [tok.text for tok in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

<h3>Sentence Tokenization or Segmentation</h3>

In [None]:
# Names of the components currently in the pipeline, before adding anything.
nlp.pipe_names

In [None]:
# Add the sentence-splitting component once. add_pipe raises if a component
# with the same name is already in the pipeline, so guard on pipe_names to
# keep this cell safe to re-run.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

In [None]:
# Check the pipeline components again after the add_pipe call.
nlp.pipe_names

In [38]:
# Uses the `nlp` object from the cells above, which had 'sentencizer' added to it.
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")

# Print one sentence per line; as the output shows, the period in "Dr."
# does not end the first sentence.
for sentence in doc.sents:
    print(sentence.text)

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi
