In [1]:
import spacy

In [5]:
# Load the `en_core_web_sm` pipeline and run it over a two-sentence sample.
# NOTE(review): "semmester" is misspelled in the sample text; it is left as-is
# here because the recorded token listing below reflects that exact input.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Dr. Fahad is a brilliant teacher. He is currently on a semmester break.")

In [6]:
# Show one token per line. The loop variable is deliberately named `token`:
# later cells (`dir(token)`, `type(token)`) inspect the value it is left
# bound to after the loop finishes.
for token in doc:
    print(token.text)

Dr.
Fahad
is
a
brilliant
teacher
.
He
is
currently
on
a
semmester
break
.


In [7]:
# Index the Doc by token position; position 5 is "teacher" in the sample text.
doc[5]

teacher

In [8]:
# List the attributes available on a token object.
# NOTE(review): `token` is whatever the most recent `for token in doc` loop
# left bound, so this cell only works after that loop cell has been run.
dir(token)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [9]:
# Inspect the class of the loaded pipeline object.
type(nlp)

spacy.lang.en.English

In [10]:
# Inspect the class of the object returned by calling the pipeline on text.
type(doc)

spacy.tokens.doc.Doc

In [11]:
# Inspect the class of a single element of the Doc.
# NOTE(review): relies on `token` left over from an earlier loop cell.
type(token)

spacy.tokens.token.Token

In [13]:
# Slicing a Doc yields a span; the first seven tokens run through the period
# that ends the first sentence.
span = doc[:7]
span

Dr. Fahad is a brilliant teacher.

In [14]:
# Inspect the class of the object produced by slicing the Doc.
type(span)

spacy.tokens.span.Span

In [18]:
# Re-bind `doc` to a sentence containing a number and a currency symbol.
# NOTE(review): this cell's execution count (18) is higher than the two cells
# that follow (16, 17), i.e. it was re-run out of order; the notebook should be
# re-executed top to bottom before sharing.
doc = nlp("Harry gave 100 $ to ryan.")

In [16]:
# Take the first token of the current Doc and display it.
token0 = doc[0]
token0

Harry

In [17]:
# Per-token flag; it is True for "Harry".
token0.is_alpha

True

In [20]:
# Take the token at position 3 (the "$" sign in this sentence) and display it.
token3 = doc[3]
token3

$

In [21]:
# "$" does not look like a number, so this flag is False here.
token3.like_num

False

In [22]:
# "$" is flagged as a currency symbol, so this is True here.
token3.is_currency

True

In [23]:
# Print one row per token: its text, its position in the Doc, and four of its
# boolean flags. The label strings are passed as separate arguments, so each
# one is followed by a single separator space before its value.
for token in doc:
    print(
        token, "==>",
        "index: ", token.i,
        "is_alpha:", token.is_alpha,
        "is_punct:", token.is_punct,
        "like_num:", token.like_num,
        "is_currency:", token.is_currency,
        sep=" ",
    )

Harry ==> index:  0 is_alpha: True is_punct: False like_num: False is_currency: False
gave ==> index:  1 is_alpha: True is_punct: False like_num: False is_currency: False
100 ==> index:  2 is_alpha: False is_punct: False like_num: True is_currency: False
$ ==> index:  3 is_alpha: False is_punct: False like_num: False is_currency: True
to ==> index:  4 is_alpha: True is_punct: False like_num: False is_currency: False
ryan ==> index:  5 is_alpha: True is_punct: False like_num: False is_currency: False
. ==> index:  6 is_alpha: False is_punct: True like_num: False is_currency: False


### Collecting student emails from the `Students.txt` file

In [34]:
# Read the roster as a list of lines (newline characters are kept).
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open("Students.txt", encoding="utf-8") as f:
    text = f.readlines()
text

['\n',
 'Name    Birthday      Email\n',
 '\n',
 'Hamza    17-08-1998    hamza@gmail.com\n',
 'Irfan    20-05-1980    irfan@gmail.com\n',
 'Akram    10-01-1985    akram@gmail.com\n',
 'M Adeel  20-11-1999    adeel@hotmail.com']

In [35]:
# Collapse the list of lines into a single string for the pipeline.
# Guarded so that re-running this cell is harmless: joining an already-joined
# string would iterate over its characters and insert a space between each one.
if not isinstance(text, str):
    text = " ".join(text)
text

'\n Name    Birthday      Email\n \n Hamza    17-08-1998    hamza@gmail.com\n Irfan    20-05-1980    irfan@gmail.com\n Akram    10-01-1985    akram@gmail.com\n M Adeel  20-11-1999    adeel@hotmail.com'

In [36]:
# Tokenize the roster text and keep the text of every token that is flagged
# as looking like an e-mail address.
doc = nlp(text)
emails = [tok.text for tok in doc if tok.like_email]

emails

['hamza@gmail.com', 'irfan@gmail.com', 'akram@gmail.com', 'adeel@hotmail.com']

### Customizing the Tokenizer

In [37]:
from spacy.symbols import ORTH

In [38]:
# Switch to a blank English pipeline and tokenize a short phrase.
# Note that this re-binds `nlp`, replacing the pipeline loaded at the top of
# the notebook for every cell that runs afterwards.
nlp = spacy.blank("en")
doc = nlp("Gimme your book !")
tokens = [tok.text for tok in doc]

In [39]:
# Display the token texts; "Gimme" is still a single token at this point.
tokens

['Gimme', 'your', 'book', '!']

In [40]:
# Register a tokenizer special case so that "Gimme" is split into the two
# pieces "Gim" + "me", then re-tokenize the same phrase to see the effect.
nlp.tokenizer.add_special_case("Gimme", [{ORTH: "Gim"}, {ORTH: "me"}])

doc = nlp("Gimme your book !")
tokens = [tok.text for tok in doc]

In [41]:
# Display the token texts again; "Gimme" now comes out as "Gim" and "me".
tokens

['Gim', 'me', 'your', 'book', '!']

### Sentence Tokenization (Segmentation)

In [42]:
# Demonstrate that a pipeline with no components cannot split sentences:
# iterating `doc.sents` raises ValueError (E030) because no sentence boundaries
# have been set. The error is caught and printed, rather than left to
# propagate, so that "Restart & Run All" can continue past this cell.
doc = nlp("Dr. Fahad is a brilliant techer. He is currently on a semmester break.")
try:
    for sentence in doc.sents:
        print(sentence)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [43]:
# List the pipeline's components; the blank pipeline has none.
nlp.pipeline

[]

In [44]:
# Add the sentencizer so that sentence boundaries get set.
# The membership check lets this cell be re-run safely on a pipeline that
# already has the component; the component is then looked up and displayed,
# so the cell output is the same on every run.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")
nlp.get_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7001e84200>

In [45]:
# With the sentencizer in the pipeline, re-process the text and print each
# detected sentence on its own line.
doc = nlp("Dr. Fahad is a brilliant techer. He is currently on a semmester break.")
for sent in doc.sents:
    print(sent)

Dr. Fahad is a brilliant techer.
He is currently on a semmester break.


In [46]:
# List the pipeline's components again; it now contains the sentencizer.
nlp.pipeline

[('sentencizer', <spacy.pipeline.sentencizer.Sentencizer at 0x7001e84200>)]