In [95]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_md")

In [9]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [11]:
for ner in nlp.pipe_labels["ner"]:
    print(f"{ner:<20s}{spacy.explain(ner)}")

CARDINAL            Numerals that do not fall under another type
DATE                Absolute or relative dates or periods
EVENT               Named hurricanes, battles, wars, sports events, etc.
FAC                 Buildings, airports, highways, bridges, etc.
GPE                 Countries, cities, states
LANGUAGE            Any named language
LAW                 Named documents made into laws.
LOC                 Non-GPE locations, mountain ranges, bodies of water
MONEY               Monetary values, including unit
NORP                Nationalities or religious or political groups
ORDINAL             "first", "second", etc.
ORG                 Companies, agencies, institutions, etc.
PERCENT             Percentage, including "%"
PERSON              People, including fictional
PRODUCT             Objects, vehicles, foods, etc. (not services)
QUANTITY            Measurements, as of weight or distance
TIME                Times smaller than a day
WORK_OF_ART         Titles of books, songs,

In [2]:
doc = nlp("The president Donald Trump visited France.")
doc.ents

(Donald Trump, France)

In [8]:
for token in doc:
    print(f"{token.text:<15s}{token.ent_type_:<15s}{spacy.explain(token.ent_type_)}")

The                           None
president                     None
Donald         PERSON         People, including fictional
Trump          PERSON         People, including fictional
visited                       None
France         GPE            Countries, cities, states
.                             None


In [34]:
import feedparser
import requests
from bs4 import BeautifulSoup

In [13]:
hindu = feedparser.parse("https://www.thehindu.com/news/national/feeder/default.rss")

In [17]:
hindu["feed"]["title"]

'| The Hindu'

In [19]:
len(hindu.entries)

100

In [20]:
post = hindu.entries

In [40]:
post[0]["link"]

'https://www.thehindu.com/news/national/committee-of-pm-lop-cji-to-advice-on-appointment-of-election-commissioners-supreme-court/article66570806.ece'

In [33]:
res = requests.get(post[0]["link"])
res.status_code

200

In [35]:
soup = BeautifulSoup(res.text)

In [43]:
for script in soup(["script","style","aside","articlebodycontent"]):
    script.extract()

In [58]:
help(soup.findChildren)

Help on method find_all in module bs4.element:

find_all(name=None, attrs={}, recursive=True, string=None, limit=None, **kwargs) method of bs4.BeautifulSoup instance
    Look in the children of this PageElement and find all
    PageElements that match the given criteria.
    
    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.
    
    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet



In [61]:
contents = soup.findAll(name="div",attrs={"class":"articlebodycontent"})[0].findChildren("p",recursive=False)

In [67]:
text = " ".join([content.get_text().strip() for content in contents])

In [68]:
doc = nlp(text)

In [86]:
help(doc.count_by)

Help on built-in function count_by:

count_by(...) method of spacy.tokens.doc.Doc instance
    Doc.count_by(self, attr_id_t attr_id, exclude=None, counts=None)
    Count the frequencies of a given attribute. Produces a dict of
            `{attribute (int): count (ints)}` frequencies, keyed by the values of
            the given attribute ID.
    
            attr_id (int): The attribute ID to key the counts.
            RETURNS (dict): A dictionary mapping attributes to integer counts.
    
            DOCS: https://spacy.io/api/doc#count_by



In [92]:
for span in doc.ents:
    print(f"{span.label_:<15s}{span.text}")

ORG            the Lok Sabha
GPE            India
ORG            Election Commissioners
PRODUCT        the Constitution Bench of the Supreme Court
ORG            Election Commissioners
PRODUCT        The Constitution Bench
ORG            Parliament
ORG            the Union of India
ORG            the Election Commission of India
ORG            Bench
ORG            ECI
CARDINAL       five
PERSON         Bench
PERSON         K.M. Joseph
PERSON         Prashant Bhushan
PERSON         Kaleeswaram Raj
PERSON         Gopal Sankaranarayanan
ORG            Opposition
GPE            India
ORG            Central Bureau of Investigation
ORG            CBI
ORG            Centre
ORG            Election Commissioners
ORG            Centre
PERSON         T.N. Seshan
PERSON         Sankaranarayanan
PERSON         Ashwini Upadhyay
ORG            Election Commissioners
ORG            the Consolidated Fund of India


In [93]:
from collections import defaultdict
count = defaultdict(int)
for span in doc.ents:
    count[span.label_] += 1 
    

In [94]:
count

defaultdict(int,
            {'ORG': 16, 'GPE': 2, 'PRODUCT': 2, 'CARDINAL': 1, 'PERSON': 8})

In [89]:
for token in doc:
    if token.ent_type_:
        print(f"{token.text:<20s}{token.ent_type_:20s}{str(token.ent_type):<5}")

the                 ORG                 383  
Lok                 ORG                 383  
Sabha               ORG                 383  
India               GPE                 384  
Election            ORG                 383  
Commissioners       ORG                 383  
the                 PRODUCT             386  
Constitution        PRODUCT             386  
Bench               PRODUCT             386  
of                  PRODUCT             386  
the                 PRODUCT             386  
Supreme             PRODUCT             386  
Court               PRODUCT             386  
Election            ORG                 383  
Commissioners       ORG                 383  
The                 PRODUCT             386  
Constitution        PRODUCT             386  
Bench               PRODUCT             386  
Parliament          ORG                 383  
the                 ORG                 383  
Union               ORG                 383  
of                  ORG           

In [96]:
displacy.render(doc,style="ent")