## Tokenization using spaCy

In [1]:
import spacy

In [6]:
# Build a blank English pipeline (no pipeline components are added yet).
nlp = spacy.blank("en")

# Run the sample text through the pipeline to obtain a tokenized Doc.
doc = nlp(
    "Dr. Smith, renowned for his groundbreaking research in neuroscience, "
    "will deliver the keynote address at the conference. "
    "After years of dedicated study, Sarah finally achieved her dream of "
    "becoming Dr. Johnson, earning her Ph.D. in astrophysics at a cost of $2000."
)

In [7]:
# Iterating over a Doc yields its tokens one at a time (words and punctuation).
# The loop variable `token` is inspected again by later cells, so it is kept.
for token in doc:
    print(token.text)

Dr.
Smith
,
renowned
for
his
groundbreaking
research
in
neuroscience
,
will
deliver
the
keynote
address
at
the
conference
.
After
years
of
dedicated
study
,
Sarah
finally
achieved
her
dream
of
becoming
Dr.
Johnson
,
earning
her
Ph.D.
in
astrophysics
at
a
cost
of
$
2000
.


In [8]:
# Inspect the class of the pipeline object created by spacy.blank("en").
type(nlp)

spacy.lang.en.English

In [9]:
# Inspect the class of the object returned by calling the pipeline on text.
type(doc)

spacy.tokens.doc.Doc

In [10]:
# `token` is the loop variable left over from the earlier `for token in doc` cell.
type(token)

spacy.tokens.token.Token

In [12]:
# List the attributes and methods exposed by a token object
# (`token` is still the loop variable from the earlier iteration over `doc`).
dir(token)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang

In [13]:
# Number of tokens in the Doc.
len(doc)

48

In [20]:
# A Doc supports integer indexing; position 45 is the "$" token in the sample text.
token45 = doc[45]
token45

$

In [21]:
# Boolean flag on the token: is it a currency symbol?
token45.is_currency

True

#### Extract Emails Exercise

In [25]:
# Load the student roster as a list of raw lines (newline characters are kept).
with open("./public/students.txt") as students_file:
    text = list(students_file)
text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n']

In [26]:
# Collapse the list of lines into one string so it can be passed to the pipeline.
# `text` is rebound from a list to a str here, so the join is guarded: re-running
# this cell on the already-joined string would otherwise insert a space between
# every character.
if not isinstance(text, str):
    text = ' '.join(text)
text



In [28]:
# Tokenize the roster text and keep every token flagged as looking like an email.
# Note: this rebinds `doc`, replacing the Doc built from the sample sentence.
doc = nlp(text)
emails = [token for token in doc if token.like_email]

emails

[virat@kohli.com, maria@sharapova.com, serena@williams.com, joe@root.com]

#### Customize Tokenizer

In [29]:
# Baseline before customizing the tokenizer: collect the text of each token.
doc2 = nlp("gimme double cheese extra large spicy pizza")

tokens = [tok.text for tok in doc2]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'spicy', 'pizza']

In [32]:
from spacy.symbols import ORTH

# Register a tokenizer special case: split the string "gimme" into "gim" + "me".
gimme_pieces = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", gimme_pieces)

In [33]:
# Re-tokenize the same sentence now that the "gimme" special case is registered.
doc2 = nlp("gimme double cheese extra large spicy pizza")

tokens = [tok.text for tok in doc2]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'spicy', 'pizza']

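A tokenizer special case can only change how the text is split into tokens; it cannot change the text itself, so the pieces must join back into the original string ("gim" + "me" = "gimme").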

#### Add Components to Pipeline

In [35]:
doc3 = nlp("Dr. Smith, renowned for his groundbreaking research in neuroscience, will deliver the keynote address at the conference. After years of dedicated study, Sarah finally achieved her dream.")

# The pipeline has no component that sets sentence boundaries at this point, so
# iterating doc3.sents raises ValueError [E030]. The error is the point of this
# cell, so catch and print it instead of letting it abort a Restart & Run All.
try:
    for sentence in doc3.sents:
        print(sentence)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [36]:
# Add the rule-based sentencizer so that Doc.sents becomes available.
# add_pipe raises if a component with this name is already in the pipeline,
# so reuse the existing one when the cell is re-run; either branch evaluates
# to the component, which is displayed as the cell output.
nlp.get_pipe('sentencizer') if nlp.has_pipe('sentencizer') else nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x1cb7b566110>

In [37]:
# Names of the components currently in the pipeline, in order.
nlp.pipe_names

['sentencizer']

In [39]:
# Same text as before; with the sentencizer in the pipeline, sentence
# boundaries are set and doc3.sents can be iterated.
doc3 = nlp(
    "Dr. Smith, renowned for his groundbreaking research in neuroscience, "
    "will deliver the keynote address at the conference. "
    "After years of dedicated study, Sarah finally achieved her dream."
)

for sentence in doc3.sents:
    print(sentence.text)

Dr. Smith, renowned for his groundbreaking research in neuroscience, will deliver the keynote address at the conference.
After years of dedicated study, Sarah finally achieved her dream.


#### Exercise

In [40]:
# Sample paragraph for the URL-extraction exercise (rebinds `text`).
# Note: one URL is broken by a line break right after "http://www.science.",
# so it does not appear in the text as a single unbroken string.
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

In [42]:
# Tokenize the exercise paragraph.
para = nlp(text)

In [47]:
# Keep the text of every token that is flagged as looking like a URL.
urls = [tok.text for tok in para if tok.like_url]

In [48]:
# Display the extracted URL strings.
urls

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [50]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

tokens = nlp(transactions)
# Print every number-like token that is immediately followed by a currency symbol.
# The final token has no successor, so it is excluded from the look-ahead;
# indexing tokens[token.i + 1] unconditionally raises IndexError whenever the
# text ends with a number-like token.
last_index = len(tokens) - 1
for token in tokens:
    if token.i < last_index and token.like_num and tokens[token.i + 1].is_currency:
        print(token, tokens[token.i + 1].text)

two $
500 €
