In [0]:
import spacy
import pandas as pd
import numpy as np

from spacy import displacy

In [0]:
#Let's load pre-trained english model

nlp = spacy.load('en')
nlp = spacy.load('en_core_web_sm')

In [0]:
string1 = 'The economic taskforce will be headed by finance minister Nirmala Sitharaman'

In [11]:
doc1 = nlp(string1)    #Creating doc object 
doc1

The economic taskforce will be headed by finance minister Nirmala Sitharaman

###Word Tokenization

In [21]:
for i,word_token in enumerate(doc1):       #Word Tokenizing
  print(i,word_token.text)

0 The
1 economic
2 taskforce
3 will
4 be
5 headed
6 by
7 finance
8 minister
9 Nirmala
10 Sitharaman


In [0]:
string2 = 'Be it doctors, nurses, hospital staff, housekeeping staff, airlines staff, 	government servants, police personnel, mediapersons, railways, bus, 	autorickshaw, home delivery people — they are serving others, by not taking care 	of themselves,” he said. He asked people for their support in the upcoming weeks 	as no vaccine had yet been discovered'

In [0]:
doc2 = nlp(string2)

###Sentence Tokenization

In [23]:
#Sentence Tokenizing
for i,sent_token in enumerate(doc2.sents):
  print(i,sent_token)

0 Be it doctors, nurses, hospital staff, housekeeping staff, airlines staff, 	government servants, police personnel, mediapersons, railways, bus, 	autorickshaw, home delivery people — they are serving others, by not taking care 	of themselves,” he said.
1 He asked people for their support in the upcoming weeks 	as no vaccine had yet been discovered


##Lemmatization <br/>
Lemmatization helps to  find the actual words in indefinite tense

In [0]:
string3 = 'They are like protectors of the nation. The country is obliged to them'
doc_lemma = nlp(string3)

In [44]:
for token in doc_lemma:
  print(token.text,token.lemma_,token.lemma_.lower().strip())

They -PRON- -pron-
are be be
like like like
protectors protector protector
of of of
the the the
nation nation nation
. . .
The the the
country country country
is be be
obliged oblige oblige
to to to
them -PRON- -pron-


In [0]:
#If we want the data in tabular form

lemma = []
for x, token in enumerate(doc_lemma):
  lemma.append([x,token.text,token.lemma_])

In [36]:
lemma_df = pd.DataFrame(lemma,columns=['Index','Text','Lemma'])
lemma_df

Unnamed: 0,Index,Text,Lemma
0,0,They,-PRON-
1,1,are,be
2,2,like,like
3,3,protectors,protector
4,4,of,of
5,5,the,the
6,6,nation,nation
7,7,.,.
8,8,The,the
9,9,country,country


##Parts of Speech Tagging

In [0]:
doc_lemma2 = nlp('He asked citizens to act with determination and patience') 

In [0]:
pos = []
for i,token in enumerate(doc_lemma2):
  pos.append([i,token.text,token.tag_])

In [50]:
pos_df = pd.DataFrame(pos,columns=['Index','Text','POS'])
pos_df 

Unnamed: 0,Index,Text,POS
0,0,He,PRP
1,1,asked,VBD
2,2,citizens,NNS
3,3,to,TO
4,4,act,VB
5,5,with,IN
6,6,determination,NN
7,7,and,CC
8,8,patience,NN


##Tagger<br/>
It returns Parts of Speech in Tree bank format(More detailed POS about words)

In [52]:
tag = []
for i in doc_lemma2:
  tag.append([i.text,i.tag_])
pd.DataFrame(tag,columns=['Words','POS'])

Unnamed: 0,Words,POS
0,He,PRP
1,asked,VBD
2,citizens,NNS
3,to,TO
4,act,VB
5,with,IN
6,determination,NN
7,and,CC
8,patience,NN


In [56]:
spacy.explain('VBD')

'verb, past tense'

## Dependency parsing
It shows us which words are dependent on which other words<br/>
**Syntactic Dependency**<br/>
It helps us to know the relation betweeen the tokens

In [62]:
dep = []
for i in doc_lemma2:
  dep.append([i.text,i.dep_])
pd.DataFrame(dep,columns=['Word','Dep'])

Unnamed: 0,Word,Dep
0,He,nsubj
1,asked,ROOT
2,citizens,dobj
3,to,aux
4,act,xcomp
5,with,prep
6,determination,pobj
7,and,cc
8,patience,conj


**Let's visualize the entire model**

In [0]:
options = {'compact':True,'bg':'seagreen','color':'#FFF','font':'Sans Sarif'}

In [66]:
displacy.render(doc_lemma2,style='dep',jupyter=True,options=options)

In [72]:
#Another example
dep1=[]
for i,token in enumerate(doc_lemma):
  dep1.append([i,token.text,token.dep_])
pd.DataFrame(dep1,columns=['Index','Word','Dep'])

Unnamed: 0,Index,Word,Dep
0,0,They,nsubj
1,1,are,ROOT
2,2,like,prep
3,3,protectors,pobj
4,4,of,prep
5,5,the,det
6,6,nation,pobj
7,7,.,punct
8,8,The,det
9,9,country,nsubjpass


In [73]:
displacy.render(doc_lemma,style='dep',jupyter=True,options=options)

**Stopwords**

In [0]:
from spacy.lang.en import STOP_WORDS

In [75]:
len(STOP_WORDS)

326

In [79]:
#Let's check stopwords 
for i in doc1:
  if i.is_stop!=True:    #Not stopwords
    print(i)

economic
taskforce
headed
finance
minister
Nirmala
Sitharaman


**We can add stopwords to the list**

In [0]:
STOP_WORDS.add('open')

In [83]:
nlp.vocab['visit'].is_stop

False

In [93]:
for i in doc_lemma:
  if i.is_stop!=True:
    print(i)


like
protectors
nation
.
country
obliged


In [0]:
doc3='The door was open till 7PM, Abhishek fought to visit Croatia but postponed due to 24 hrs meeting'

In [110]:
add_stop = ['door','visit']
for word in add_stop:
  nlp.vocab[word].is_stop == True
ex = nlp(doc3)
new_text = [word.text for word in ex if word.is_stop == False]
print('Actual text \n',ex)
print('Filtered Text \n',new_text) 



Actual text 
 The door was open till 7PM, Abhishek fought to visit Croatia but postponed due to 24 hrs meeting
Filtered Text 
 ['door', 'till', '7PM', ',', 'Abhishek', 'fought', 'visit', 'Croatia', 'postponed', '24', 'hrs', 'meeting']


**Token Similarity**

Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.

other (object): The object to compare with. By default, accepts `Doc`,
    `Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.

In [112]:
#Let's create an object

token = nlp(u'I need sometime')

for token1 in token:
  for token2 in token:
    print(token1.text,token2.text,token1.similarity(token2)) 

I I 1.0
I need -0.03315466
I sometime -0.06914742
need I -0.03315466
need need 1.0
need sometime -0.05387174
sometime I -0.06914742
sometime need -0.05387174
sometime sometime 1.0


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


**Merging and splitting**

Mark a span for merging. The attrs will be applied to the resulting
token.

span (Span): The span to merge.
attrs (dict): Attributes to set on the merged token

In [118]:
nlp = spacy.load('en_core_web_sm')
docs = nlp('I live in New Delhi')
print('Before:',[token.text for token in docs])

with docs.retokenize() as retokenizer:
  retokenizer.merge(docs[3:5], attrs={'Lemma':'New Delhi'})
print('After:',[token.text for token in docs]) 

Before: ['I', 'live', 'in', 'New', 'Delhi']
After: ['I', 'live', 'in', 'New Delhi']


**Checking if the character in the text are aphabet**

In [123]:
for token in docs:
  print(token.text,token.is_alpha,token.is_punct,token.is_digit)

I True False False
live True False False
in True False False
New Delhi False False False


## Entity recognition<br/>
This is very important in te space of NLP. It helps us know the domain or entity a word belongs to




The named entities in the document. Returns a tuple of named entity
`Span` objects, if the entity recognizer has been applied.

RETURNS (tuple): Entities in the document, one `Span` per entity.


In [127]:
ex1 = nlp('The economic taskforce will be headed by finance minister of India Nirmala Sitharaman')
for i in ex1.ents:     #ents = entities
  print(i.text,i.label_)

India GPE
Nirmala Sitharaman PERSON


In [133]:
displacy.render(ex1,style='ent',jupyter=True)

In [129]:
#Another example
ex2=nlp('The door was open till 7PM, Abhishek fought to visit Croatia but postponed due to 24 hrs meeting')

for x in ex2.ents:
  print(x.text,x.label_) 

7PM CARDINAL
Abhishek PERSON
Croatia ORG
24 CARDINAL


In [132]:
displacy.render(ex2,style='ent',jupyter=True)

**Entity annotation**<br/>
When no entity is assigned, we can assign them a particular entity

In [139]:
from spacy.tokens import Span

doc = nlp('RBI takes care of banking activites')
first = [(i.text,i.start_char,i.end_char,i.label_) for i in doc.ents]
print(first)

tn_ent = Span(doc,0,1,label = 'ORG')

doc_ents = list(doc.ents)+[tn_ent]
[(i.text,i.label_) for i in doc_ents]

[]


[('RBI', 'ORG')]