In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import spacy

# Blank NLP Pipeline

In [5]:
nlp = spacy.blank("en")

doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for token in doc:
    print(token)

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [6]:
nlp.pipe_names

[]

### Download Trained Pipeline
To download trained pipeline use a command such as,

python -m spacy download en_core_web_sm

This downloads the small (sm) pipeline for english language

In [7]:
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [13]:
nlp.pipeline


[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fa9f5c208e0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fa9f5c20040>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fa9f5da1e40>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fa9f5a5ec40>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fa9f5a1c080>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fa9f5da1dd0>)]

In [9]:
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for token in doc:
    print(token, " | ", spacy.explain(token.pos_), " | ", token.lemma_)


Captain  |  proper noun  |  Captain
america  |  proper noun  |  america
ate  |  verb  |  eat
100  |  numeral  |  100
$  |  numeral  |  $
of  |  adposition  |  of
samosa  |  proper noun  |  samosa
.  |  punctuation  |  .
Then  |  adverb  |  then
he  |  pronoun  |  he
said  |  verb  |  say
I  |  pronoun  |  I
can  |  auxiliary  |  can
do  |  verb  |  do
this  |  pronoun  |  this
all  |  determiner  |  all
day  |  noun  |  day
.  |  punctuation  |  .


# Named Entity Recognition

In [10]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
for ent in doc.ents:
    print(ent.text, ent.label_)

Tesla Inc ORG
$45 billion MONEY


In [11]:
from spacy import displacy

displacy.render(doc, style="ent")

### Trained processing pipeline in French

- You need to install the processing pipeline for french language using this command,

- python -m spacy download fr_core_news_sm

In [15]:
!python -m spacy download fr_core_news_sm

Collecting fr-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.5.0/fr_core_news_sm-3.5.0-py3-none-any.whl (16.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [16]:
nlp = spacy.load("fr_core_news_sm")


In [17]:
doc = nlp("Tesla Inc va racheter Twitter pour $45 milliards de dollars")
for ent in doc.ents:
    print(ent.text, " | ", ent.label_, " | ", spacy.explain(ent.label_))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  MISC  |  Miscellaneous entities, e.g. events, nationalities, products or works of art


In [18]:
for token in doc:
    print(token, " | ", token.pos_, " | ", token.lemma_)

Tesla  |  PROPN  |  Tesla
Inc  |  PROPN  |  Inc
va  |  VERB  |  aller
racheter  |  VERB  |  racheter
Twitter  |  VERB  |  twitter
pour  |  ADP  |  pour
$  |  NOUN  |  dollar
45  |  NUM  |  45
milliards  |  NOUN  |  milliard
de  |  ADP  |  de
dollars  |  NOUN  |  dollar


# Adding a component to a blank pipeline

In [19]:
source_nlp = spacy.load("en_core_web_sm")

nlp = spacy.blank("en")
nlp.add_pipe("ner", source=source_nlp)
nlp.pipe_names

['ner']

In [20]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
for ent in doc.ents:
    print(ent.text, ent.label_)

Tesla Inc ORG
$45 billion MONEY


In [21]:
from spacy import displacy

displacy.render(doc, style="ent")

# Excersie: 1
- Get all the proper nouns from a given text in a list and also count how many of them.
- Proper Noun means a noun that names a particular person, place, or thing.

In [30]:
text = '''Ravi and Raju are the best friends from school days.They wanted to go for a world tour and 
visit famous cities like Paris, London, Dubai, Rome etc and also they called their another friend Mohan to take part of this world tour.
They started their journey from Hyderabad and spent next 3 months travelling all the wonderful cities in the world and cherish a happy moments!
'''

# https://spacy.io/usage/linguistic-features

#creating the nlp object
doc = nlp(text)   


#list for storing the proper nouns
all_proper_nouns = []  


for token in doc:
    if token.pos_ == "PROPN":        #checking the whether token belongs to parts of speech "PROPN" [Proper Noun]
        all_proper_nouns.append(token)
  

 #finally printing the results
print("Proper Nouns: ", all_proper_nouns)
print("Count: ", len(all_proper_nouns))

Proper Nouns:  []
Count:  0


# Excersie: 2
- Get all companies names from a given text and also the count of them.
- Hint: Use the spacy ner functionality

In [25]:
text = '''The Top 5 companies in USA are Tesla, Walmart, Amazon, Microsoft, Google and the top 5 companies in 
India are Infosys, Reliance, HDFC Bank, Hindustan Unilever and Bharti Airtel'''


doc = nlp(text)

#list for storing the company names
all_company_names = []

for ent in doc.ents:
    if ent.label_ == 'ORG':     #checking the whether token belongs to entity "ORG" [Organisation]
        all_company_names.append(ent)



#finally printing the results
print("Company Names: ", all_company_names)
print("Count: ", len(all_company_names))

Company Names:  [Tesla, Walmart, Amazon, Microsoft, Google, Infosys, Reliance, HDFC Bank, Hindustan Unilever, Bharti]
Count:  10


# END