## Tokenization

In [1]:
import spacy

### Creating a blank language object

In [2]:
nlp = spacy.blank("en")  # blank English pipeline (no trained components loaded)

<img src="spacy_blank_pipeline.jpg" height="100" width="500"/>

### Creating a document (paragraph)

In [3]:
text = "Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate"

In [43]:
doc = nlp(text)

In [44]:
for token in doc:  # iterating over a Doc yields its tokens one by one
    print(token)

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate


In [7]:
doc[0]

Dr.

In [8]:
sentence = "Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate"

In [9]:
sentence.split()  # plain whitespace splitting keeps '2$' as one token; it has no knowledge of the language

['Dr.',
 'Strange',
 'loves',
 'pav',
 'bhaji',
 'of',
 'mumbai',
 'as',
 'it',
 'costs',
 'only',
 '2$',
 'per',
 'plate']

In [10]:
# A sentence with quotes, a contraction and an abbreviation:
# str.split() only cuts at whitespace, so punctuation stays glued to the words.
sentence = '''"Let's go to N.Y.!"'''
sentence.split()

['"Let\'s', 'go', 'to', 'N.Y.!"']

In [11]:
nlp = spacy.blank("en")
doc = nlp(sentence)

In [12]:
for token in doc:
    print(token)

"
Let
's
go
to
N.Y.
!
"


In [13]:
type(nlp)  # the pipeline object is an instance of spaCy's English language class

spacy.lang.en.English

In [14]:
type(doc)

spacy.tokens.doc.Doc

In [15]:
type(token)

spacy.tokens.token.Token

In [16]:
# Slicing a Doc returns a Span (here the tokens at positions 1 through 4).
span = doc[1:5]
type(span)

spacy.tokens.span.Span

In [17]:
doc = nlp("Tony gave two $ to Peter")

In [18]:
token0 = doc[0]
token0

Tony

In [19]:
dir(token0)  # list the attributes and methods available on the token object

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [20]:
token0.is_alpha

True

In [21]:
token0.like_num

False

In [22]:
token2= doc[2]
token2

two

In [23]:
token2.like_num  # True: spaCy treats the spelled-out number "two" as number-like

True

In [24]:
token3= doc[3]
token3

$

In [25]:
token3.is_currency

True

In [26]:
doc

Tony gave two $ to Peter

In [27]:
# One line per token: its position in the Doc plus a few lexical flags.
# The f-string reproduces the original spacing exactly (including " , ").
for i in doc:
    print(
        f"{i} ==> index: {i.i} is_alpha: {i.is_alpha} , "
        f"is_punct: {i.is_punct} , like_num: {i.like_num} , "
        f"is_currency: {i.is_currency}"
    )

Tony ==> index: 0 is_alpha: True , is_punct: False , like_num: False , is_currency: False
gave ==> index: 1 is_alpha: True , is_punct: False , like_num: False , is_currency: False
two ==> index: 2 is_alpha: True , is_punct: False , like_num: True , is_currency: False
$ ==> index: 3 is_alpha: False , is_punct: False , like_num: False , is_currency: True
to ==> index: 4 is_alpha: True , is_punct: False , like_num: False , is_currency: False
Peter ==> index: 5 is_alpha: True , is_punct: False , like_num: False , is_currency: False


In [30]:
# Read the exercise file as a list of lines.
# An explicit encoding makes the read independent of the platform default
# (which differs between Windows and other systems).
# NOTE(review): this is a machine-specific absolute path; consider making it
# relative to the notebook or driven by a configurable data directory.
with open(
    "C:/Users/Owner/nlp-tutorials/4_tokenization/Exercise/students.txt",
    encoding="utf-8",
) as f:
    text = f.readlines()
text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [31]:
# `text` arrives here as the list of lines produced by readlines(); collapse
# it into a single string so it can be passed to the pipeline.
# The isinstance guard keeps this cell idempotent: joining an already-joined
# string would insert a space between every character.
if not isinstance(text, str):
    text = " ".join(text)
text



In [32]:
doc = nlp(text)

In [33]:
email = []

In [35]:
# Collect every token that spaCy flags as e-mail-like.
# Building the list in one expression (rather than appending to a list that
# was created in a different cell) means re-running this cell can never
# accumulate duplicate entries.
email = [token for token in doc if token.like_email]
email

[virat@kohli.com, maria@sharapova.com, serena@williams.com, joe@root.com]

In [None]:
[token for token in doc if token.like_email]

### customizing tokenizer

In [74]:
doc = nlp("gimme double cheese extra large healthy pizza")

In [75]:
doc.text  # the text of the Doc as a plain Python string

'gimme double cheese extra large healthy pizza'

In [65]:
doc[0].text

'gimme'

In [76]:
[i.text for i in doc]  # the text of every token, collected with a list comprehension

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [77]:
from spacy.symbols import ORTH

In [82]:
# Register a tokenizer special case: whenever the exact string "gimme" is
# seen, emit the two tokens "gim" and "me" instead of a single token.
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])

# Tokenize the sentence again and list the resulting token texts.
doc = nlp("gimme double cheese extra large healthy pizza")
[token.text for token in doc]

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

### Sentence tokenizer

In [89]:
doc = nlp("Dr.Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")

In [90]:
# Doc.sents needs sentence boundaries to be set. A blank pipeline has no
# component that sets them, in which case spaCy raises ValueError.
# Catching it keeps the notebook runnable from top to bottom on a fresh
# kernel while still showing the message; when a sentence-splitting
# component is already present, the sentences are printed as usual.
try:
    for sent in doc.sents:
        print(sent)
except ValueError as err:
    print(f"Cannot split into sentences yet: {err}")

Dr.Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi


In [91]:
nlp.pipe_names  # names of the components in the pipeline, in order; a freshly created blank pipeline has none

['sentencizer']

In [95]:
# A blank pipeline cannot split sentences, so add the rule-based
# "sentencizer" component. The membership check keeps this cell re-runnable:
# adding a component whose name is already in the pipeline raises an error.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

# Re-process the same text so the Doc carries the sentence boundaries set by
# the component (a Doc created before the component was added has none).
doc = nlp(doc.text)

In [94]:
nlp.pipe_names

['sentencizer']

In [96]:
for i in doc.sents:  # each item is one sentence of the Doc
    print(i)

Dr.Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi


(1) Think Stats is a free book for studying statistics (https://greenteapress.com/thinkstats2/thinkstats2.pdf)

This book references many websites from which you can download free datasets. You are an NLP engineer working for a company, and you want to collect all the dataset websites mentioned in the book. To keep the exercise simple, you are given one paragraph from the book, and you want to grab all the URLs in that paragraph using spaCy.

In [147]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

# TODO: Write code here
# Hint: token has an attribute that can be used to detect a url

In [148]:
nlp = spacy.blank("en")

In [149]:
doc = nlp(text)

In [152]:
# like_url is the per-token flag for URL-looking text.
# The list is named for what it actually holds (URLs, not e-mail addresses)
# and is built in a single expression, so re-running the cell is safe.
urls = [token for token in doc if token.like_url]
urls

[http://www.data.gov/,
 http://www.science,
 http://data.gov.uk/.,
 http://www3.norc.org/gss+website/,
 http://www.europeansocialsurvey.org/.]

(2) Extract all money transactions, along with their currency, from the sentence below. The output should be:

two $

500 €

In [153]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

# TODO: Write code here
# Hint: Use token.i for the index of a token and token.is_currency for currency symbol detection

In [154]:
doc = nlp(transactions)

In [156]:
for i in doc:
    print(i)

Tony
gave
two
$
to
Peter
,
Bruce
gave
500
€
to
Steve


In [161]:
type(doc)

spacy.tokens.doc.Doc

In [162]:
token0 = doc[0]

In [163]:
token0

Tony

In [164]:
token0.is_currency

False

In [168]:
# Keep every token that is either a currency symbol or number-like.
# The flags are already booleans, so a plain `or` replaces the original
# `is True` identity checks combined with the bitwise `|` operator.
cur = [token for token in doc if token.is_currency or token.like_num]
cur

[two, $, 500, €]

In [169]:
# Print each "<amount> <currency>" pair: a number-like token immediately
# followed by a currency symbol.
for token in doc:
    # Guard the look-ahead: a number-like token at the very end of the Doc has
    # no successor, and indexing doc[token.i + 1] would raise IndexError.
    if token.like_num and token.i + 1 < len(doc):
        following = doc[token.i + 1]
        if following.is_currency:
            print(token.text, following.text)

two $
500 €


In [170]:
doc[0]

Tony