![alt text](image-5.png)

In [3]:
import spacy

# Basics of spaCy

![alt text](image-6.png)
![alt text](image-7.png)

In [8]:
# Build a blank English pipeline and run it over a sample sentence.
nlp = spacy.blank("en")
doc = nlp("Dr.Strange loves pav bhaji of mumbai as it costs only 2$ per plate.")

# Iterating a Doc yields its tokens one at a time; printing a token shows its text.
# NOTE(review): the loop variable `token` stays defined after the loop and a
# later cell inspects it with `type(token)`.
for token in doc:
    print(token)

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


In [10]:
# Inspect the class of the pipeline object returned by spacy.blank("en").
type(nlp)

spacy.lang.en.English

In [11]:
# Inspect the class of the object returned by calling the pipeline on a string.
type(doc)

spacy.tokens.doc.Doc

In [13]:
# `token` is the loop variable left over from the earlier `for token in doc` cell.
type(token)

spacy.tokens.token.Token

In [15]:
# Rebind `doc` to a shorter sentence; the token-attribute cells below index into it.
doc = nlp("Tony gave two $ to peter")

In [18]:
# Take the first token of the document and list every attribute and method it
# exposes. Only the last expression of a cell is displayed, so the listing is
# the cell's output.
token0 = doc[0]
dir(token0)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang

In [23]:
# Show two flags of the first token, one per line: whether it is alphabetic
# and whether it looks like a number.
print(token0.is_alpha, token0.like_num, sep="\n")

True
False


In [26]:
# The third token is the word "two": `like_num` is True for it even though the
# number is spelled out rather than written with digits.
token2 = doc[2]
print(token2, token2.like_num, sep="\n")

two
True


In [27]:
# Print one summary line per token: its position in the doc followed by a few
# of its boolean flags.
for token in doc:
    fields = [
        token, "==>", "index: ", token.i,
        "is_alpha:", token.is_alpha,
        "is_punct:", token.is_punct,
        "like_num:", token.like_num,
        "is_currency:", token.is_currency,
    ]
    # print() joins the fields with single spaces, exactly as separate arguments would be.
    print(*fields)

Tony ==> index:  0 is_alpha: True is_punct: False like_num: False is_currency: False
gave ==> index:  1 is_alpha: True is_punct: False like_num: False is_currency: False
two ==> index:  2 is_alpha: True is_punct: False like_num: True is_currency: False
$ ==> index:  3 is_alpha: False is_punct: False like_num: False is_currency: True
to ==> index:  4 is_alpha: True is_punct: False like_num: False is_currency: False
peter ==> index:  5 is_alpha: True is_punct: False like_num: False is_currency: False


# An application of spaCy

Suppose we want to email every student mentioned in a text file (`3. students.txt`). Collecting the addresses by hand is fine when the file lists only a few students, but it becomes impractical as the list grows. One way to automate this is with regular expressions (regex); in this section, let's see how to do it with `spaCy` instead.



![alt text](image-8.png)


In [52]:
# Read the student file as a list of lines. The encoding is passed explicitly
# so the result does not depend on the platform's default text encoding.
# NOTE(review): assumes the file is UTF-8 (plain ASCII qualifies) — confirm.
with open("3. students.txt", encoding="utf-8") as f:
    text = f.readlines()
    print(text)



In [53]:
# Collapse the list of lines into one string for the tokenizer.
# The isinstance guard keeps the cell safe to re-run: once `text` is already a
# string, joining it again would put a space between every character.
if not isinstance(text, str):
    text = " ".join(text)
text



In [54]:
# Tokenize the whole text and keep the text of every token whose `like_email`
# flag is set.
doc = nlp(text)
emails = [token.text for token in doc if token.like_email]
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

# Customizing spaCy


Example: by default, spaCy treats `gimme` as a single token, even though it stands for two words ("give me"). In cases like this we can customize spaCy's tokenizer with a special-case rule.

In [45]:
# Tokenize with the default rules and collect each token's text.
doc = nlp("gimme double cheese extra large healthy pizza")

tokens = []
for token in doc:
    tokens.append(token.text)
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [51]:
from spacy.symbols import ORTH

# Tokenization can only split text, never rewrite it: the ORTH pieces of a
# special case must concatenate back to the original string. "give" + "me" is
# "giveme", not "gimme", so spaCy rejects the rule with ValueError [E997].
# The error is what this cell demonstrates, so it is caught and printed
# instead of stopping the notebook.
try:
    nlp.tokenizer.add_special_case("gimme", [{ORTH: "give"}, {ORTH: "me"}])
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: [E997] Tokenizer special cases are not allowed to modify the text. This would map 'gimme' to 'giveme' given token attributes '[{65: 'give'}, {65: 'me'}]'.

In [50]:
from spacy.symbols import ORTH

# Register a special case whose pieces concatenate back to the original string
# ("gim" + "me" == "gimme"), then re-tokenize the sentence.
gimme_pieces = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", gimme_pieces)

doc = nlp("gimme double cheese extra large healthy pizza")

tokens = [piece.text for piece in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

# Sentence Tokenization


In [55]:
doc = nlp("Dr.Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")

# Nothing in the pipeline has set sentence boundaries yet, so iterating
# `doc.sents` raises ValueError [E030]. The error is what this cell
# demonstrates, so it is caught and printed instead of stopping the notebook.
try:
    for sentence in doc.sents:
        print(sentence)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [58]:
# List the names of the components currently in the pipeline.
nlp.pipe_names

[]

In [59]:
# Add the rule-based sentencizer so that `doc.sents` becomes available.
# Adding a component whose name is already in the pipeline raises an error, so
# the guard keeps this cell safe to re-run.
if not nlp.has_pipe("sentencizer"):
    nlp.add_pipe("sentencizer")
# Display the component, as a bare `nlp.add_pipe(...)` call would.
nlp.get_pipe("sentencizer")


<spacy.pipeline.sentencizer.Sentencizer at 0x17c55985350>

![alt text](image-9.png)

In [60]:
# Re-run the pipeline now that it contains the sentencizer, then print each
# detected sentence on its own line.
two_sentences = "Dr.Strange loves pav bhaji of mumbai. Hulk loves chat of delhi"
doc = nlp(two_sentences)

for sentence in doc.sents:
    print(sentence.text)

Dr.Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi
