## Tokenization with spaCy

In [1]:
import spacy

In [18]:
# Blank English pipeline: it exposes a tokenizer but has no other components
# until some are added with add_pipe().
nlp = spacy.blank("en")

In [33]:
# Run a sentence through the pipeline; iterating the resulting Doc yields its
# tokens, each printed on its own line.
doc = nlp("X love meat kebab with cheese as it cost 2$ each.")

for token in doc:
    print(token)

X
love
meat
kebab
with
cheese
as
it
cost
2
$
each
.


In [32]:
# Tokenize a sentence with a contraction and an abbreviation: in the output
# below, "Let's" is split into "Let" + "'s" while "N.Y." stays a single token.
# `doc` is reused by the cells that follow.
doc = nlp("Let's go to N.Y.!")
for token in doc:
  print(token)

Let
's
go
to
N.Y.
!


In [23]:
# Inspect the class of the pipeline object returned by spacy.blank().
type(nlp)

In [24]:
# Inspect the class of the object returned by calling nlp() on a string.
type(doc)

spacy.tokens.doc.Doc

In [29]:
# Take a token straight from `doc` instead of relying on a loop variable left
# over from an earlier cell.  To list a token's attributes: dir(doc[0])
type(doc[0])

spacy.tokens.token.Token

In [30]:
# For every token, show its position in the Doc and a few lexical flags.
for token in doc:
    report = (
        token, "==>",
        "index: ", token.i,
        "is_alpha:", token.is_alpha,
        "is_punct:", token.is_punct,
        "like_num:", token.like_num,
        "is_currency:", token.is_currency,
    )
    # print joins the fields with single spaces, one line per token.
    print(*report)

Let ==> index:  0 is_alpha: True is_punct: False like_num: False is_currency: False
's ==> index:  1 is_alpha: False is_punct: False like_num: False is_currency: False
go ==> index:  2 is_alpha: True is_punct: False like_num: False is_currency: False
to ==> index:  3 is_alpha: True is_punct: False like_num: False is_currency: False
N.Y. ==> index:  4 is_alpha: False is_punct: False like_num: False is_currency: False
! ==> index:  5 is_alpha: False is_punct: True like_num: False is_currency: False


In [37]:
# Read student.txt as a list of lines (newlines kept).  The encoding is given
# explicitly so the result does not depend on the platform's default encoding.
with open("student.txt", encoding="utf-8") as f:
    text = f.readlines()
text

['_ high school, 12th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'A   5 June, 1882    A@example.com\n',
 'B\t12 April, 2001  B@hotmail.com\n',
 'C  24 June, 1998   C@gmail.com \n',
 'D      1 May, 1997    D@tb.com']

In [38]:
# Collapse the list of lines into one string so it can be passed to nlp() in a
# single call.  The guard keeps the cell idempotent: joining a value that is
# already a string would insert a space between every character on a re-run.
if not isinstance(text, str):
    text = " ".join(text)
text



In [40]:
# Tokenize the joined file contents and keep only the tokens that spaCy flags
# as looking like an e-mail address.
doc = nlp(text)
emails = [token.text for token in doc if token.like_email]
emails

['A@example.com', 'B@hotmail.com', 'C@gmail.com', 'D@tb.com']

### Customizing the tokenizer

In [43]:
# Re-create the blank English pipeline and tokenize a sentence with the
# default rules, collecting the token strings into a plain list.
nlp = spacy.blank("en")
doc = nlp("Give me double cheese extra large kebab")
tokens = [tok.text for tok in doc]
tokens

['Give', 'me', 'double', 'cheese', 'extra', 'large', 'kebab']

In [46]:
from spacy.symbols import ORTH

# Register a special case: whenever the tokenizer meets the exact string
# "gibme", it emits the two tokens "gib" and "me" instead of one.
gibme_pieces = [{ORTH: "gib"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gibme", gibme_pieces)

# Tokenize a sentence using the new rule and list the resulting token strings.
doc = nlp("gibme double cheese extra large kebab")
tokens = [tok.text for tok in doc]
tokens

['gib', 'me', 'double', 'cheese', 'extra', 'large', 'kebab']

### Sentence Tokenization

In [47]:
# At this point the pipeline has no component that sets sentence boundaries, so
# iterating doc.sents raises ValueError [E030].  The error is the point of this
# cell, so it is caught and displayed rather than left to abort "Run All".
doc = nlp("X love meat kebab etc. with cheese. Y love cheeseburger.")
try:
    for sentence in doc.sents:
        print(sentence)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [49]:
# Add the sentencizer so doc.sents has sentence boundaries to work with.
# add_pipe() raises if a component with the same name is already present, so
# the guard keeps this cell safe to re-run; the component is displayed either way.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")
nlp.get_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x78f141b34510>

In [50]:
# List the names of the components currently in the pipeline.
nlp.pipe_names

['sentencizer']

In [51]:
# Re-run the same text now that sentence boundaries are set: doc.sents yields
# one span per sentence.  Note in the output that the period after "etc." is
# also treated as a sentence end.
doc = nlp("X love meat kebab etc. with cheese. Y love cheeseburger.")
for sentence in doc.sents:
    print(sentence)

X love meat kebab etc.
with cheese.
Y love cheeseburger.
