# Spacy tokenization


In [1]:
import spacy

In [4]:
nlp = spacy.blank('en')  # to know what you use here go to spacy.io/usage/models
doc = nlp('Dr. Aghaei loves Tahchin, and it cost not that much, around 5 $') # privide the text here
for token in doc:
    print(token)

Dr.
Aghaei
loves
Tahchin
,
and
it
cost
not
that
much
,
around
5
$


In [5]:
type(nlp)

spacy.lang.en.English

In [6]:
type(token)

spacy.tokens.token.Token

In [13]:
doc[1:7]

Aghaei loves Tahchin, and it

In [20]:
span = doc[0]
print(span)
type(span)

Dr.


spacy.tokens.token.Token

In [17]:
dir(span)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang

In [22]:
span.is_title


True

## read the file  and extract the emails from the text

In [27]:
with open ('file.text') as f:
    text = f.readlines()
text = ''.join(text)
doc = nlp(text)
emails = []
for token in doc:
    if token.like_email:
        emails.append(token.text)
emails


FileNotFoundError: [Errno 2] No such file or directory: 'file.text'

## Read a persian text and customize tokenization

In [29]:
import spacy

# Create a blank pipeline for Persian
nlp = spacy.blank('fa')  # Use 'fa' for Farsi (Persian), NOT 'fr'

# Persian text example
text = ' سلام مُن این را  خریدم ده تومان  '

# Tokenization
doc = nlp(text)

# Display token information
for token in doc:
    print(token, token.text, token.is_currency)


    False
سلام سلام False
مُن مُن False
این این False
را را False
    False
خریدم خریدم False
ده ده False
تومان تومان False
    False


In [30]:
from spacy.tokens import Token

# Register custom extension
Token.set_extension("is_currency", default=False, force=True)

# Add rule for "تومان"
for token in doc:
    if token.text.strip() == "تومان":
        token._.is_currency = True
    print(token.text, token._.is_currency)


  False
سلام False
مُن False
این False
را False
  False
خریدم False
ده False
تومان True
  False


## Detect the URLs from the following text

In [39]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''
import spacy 
nlp = spacy.blank('en')
doc = nlp(text)
urls = []
for token in doc:
    if token.like_url:
        urls.append(token.text)
# or simply write urls = [token.text for token in doc if token.like_url]
urls

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

## Extract all money transaction from below sentence along with currency.

In [38]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

In [40]:
doc = nlp(transactions)
for token in doc:
    if token.like_num and doc[token.i+1].is_currency:
        print(token.text, doc[token.i+1].text) 

two $
500 €
