In [35]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [36]:
doc=nlp("Hi my name is kuntal.")

In [37]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [38]:
type(doc)

spacy.tokens.doc.Doc

### tokenizing

In [39]:
for token in doc:
  print(token.text)

Hi
my
name
is
kuntal
.


### pos_tagging

In [40]:
[print(token.text,"--->",token.pos_) for token in doc]

Hi ---> INTJ
my ---> PRON
name ---> NOUN
is ---> AUX
kuntal ---> NOUN
. ---> PUNCT


[None, None, None, None, None, None]

In [41]:
spacy.explain("INTJ")

'interjection'

Sentence Identification

In [42]:
about_text = (
     "Gus Proto is a Python developer currently"
" working for a London-based Fintech"
     " company. He is interested in learning"
     " Natural Language Processing." )
about_doc = nlp(about_text)
sentences = list(about_doc.sents)
len(sentences)
for sentence in sentences:
  print(f"{sentence[:5]}...")


Gus Proto is a Python...
He is interested in learning...


In [43]:
print(sentences)

[Gus Proto is a Python developer currently working for a London-based Fintech company., He is interested in learning Natural Language Processing.]


In [44]:
for token in about_doc:
  print(str(token)+"         "+str(token.is_alpha)+"    "+str(token.is_stop)+"    "+str(token.like_num)+"    "+str(token.is_title))


Gus         True    False    False    True
Proto         True    False    False    True
is         True    True    False    False
a         True    True    False    False
Python         True    False    False    True
developer         True    False    False    False
currently         True    False    False    False
working         True    False    False    False
for         True    True    False    False
a         True    True    False    False
London         True    False    False    True
-         False    False    False    False
based         True    False    False    False
Fintech         True    False    False    True
company         True    False    False    False
.         False    False    False    False
He         True    True    False    True
is         True    True    False    False
interested         True    False    False    False
in         True    True    False    False
learning         True    False    False    False
Natural         True    False    False    True
Langua

### Excersie

In [45]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

#you are given a paragraph from this book and you want to grab all urls from this paragraph using spacy

In [46]:
doc=nlp(text)
url=[]
for i in doc:
  if(i.like_url==True):
    url.append(i)
url


[http://www.data.gov/,
 http://www.science.gov/,
 http://data.gov.uk/.,
 http://www3.norc.org/gss+website/,
 http://www.europeansocialsurvey.org/.]

In [47]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

# Extract all money transaction from below sentence along with currency. Output should be, two $  , 500 €

In [48]:
doc=nlp(transactions)
for i in range(len(doc)):
  if doc[i].like_num and doc[i+1].is_currency:
    print(str(doc[i])+" "+str(doc[i+1]))

two $
500 €


creating a blank pipeline

In [49]:
nlp_blank=spacy.blank('en')
nlp_blank.pipe_names

[]

part of speech tagging and lemmitization

In [50]:
text = '''Ravi and Raju are the best friends from school days.They wanted to go for a world tour and 
visit famous cities like Paris, London, Dubai, Rome etc and also they called their another friend Mohan to take part of this world tour.
They started their journey from Hyderabad and spent next 3 months travelling all the wonderful cities in the world and cherish a happy moments!'''

document=nlp(text)

for token in document:
  print(str(token.text)+" | "+str(spacy.explain(token.pos_))+" | "+str(token.lemma_))

Ravi | noun | ravi
and | coordinating conjunction | and
Raju | proper noun | Raju
are | auxiliary | be
the | determiner | the
best | adjective | good
friends | noun | friend
from | adposition | from
school | noun | school
days | noun | day
. | punctuation | .
They | pronoun | they
wanted | verb | want
to | particle | to
go | verb | go
for | adposition | for
a | determiner | a
world | noun | world
tour | noun | tour
and | coordinating conjunction | and

 | space | 

visit | verb | visit
famous | adjective | famous
cities | noun | city
like | adposition | like
Paris | proper noun | Paris
, | punctuation | ,
London | proper noun | London
, | punctuation | ,
Dubai | proper noun | Dubai
, | punctuation | ,
Rome | proper noun | Rome
etc | other | etc
and | coordinating conjunction | and
also | adverb | also
they | pronoun | they
called | verb | call
their | pronoun | their
another | determiner | another
friend | noun | friend
Mohan | proper noun | Mohan
to | particle | to
take | verb | take


Named Entity Recognization

In [51]:
for i in doc.ents:
  print(i.text+" ----->> "+i.label_)

Tony ----->> PERSON
two $ ----->> MONEY
Peter ----->> PERSON
Bruce ----->> PERSON
500 € ----->> MONEY


In [52]:
from spacy import displacy
displacy.render(doc, style="ent")

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Tony\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>\n</mark>\n gave \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    two $\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">MONEY</span>\n</mark>\n to \n<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Peter\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>\n</mark>\n, \n<ma

### spacy pipeline exercise code

Excersie: 1
Get all the proper nouns from a given text in a list and also count how many of them.
Proper Noun means a noun that names a particular person, place, or thing.




In [53]:
text = ''' Ravi and Raju are the best friends from school days.They wanted to go for a world tour and 
visit famous cities like Paris, London, Dubai, Rome etc and also they called their another friend Mohan to take part of this world tour.
They started their journey from Hyderabad and spent next 3 months travelling all the wonderful cities in the world and cherish a happy moments! '''

In [54]:
doc=nlp(text)
lst=[]
for i in doc:
  if(spacy.explain(i.pos_)=='proper noun'):
    lst.append(i.text)
print(f"{lst} ====>>> {len(lst)}")

['Ravi', 'Raju', 'Paris', 'London', 'Dubai', 'Rome', 'Mohan', 'Hyderabad'] ====>>> 8


Excersie: 2
Get all companies names from a given text and also the count of them.
Hint: Use the spacy ner functionality

In [55]:
text = ''' The Top 5 companies in USA are Tesla, Walmart, Amazon, Microsoft, Google and the top 5 companies in 
India are Infosys, Reliance, HDFC Bank, Hindustan Unilever and Bharti Airtel '''
doc1=nlp(text)
lst1=[]
for i in doc1.ents:
  if i.label_=='ORG':
    lst1.append(i.text)
print(f"{lst1} ====>>> {len(lst1)}")

['Walmart', 'Amazon', 'Microsoft', 'Google', 'Infosys', 'Reliance', 'HDFC Bank', 'Hindustan Unilever'] ====>>> 8
