In [1]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm pipeline once; the cells below call `nlp(...)` on each text.
nlp = en_core_web_sm.load()

In [2]:
doc = nlp('Nintendo Co Ltd 7974.T said on Thursday third-quarter operating profit rose 6%,driven by Switch console sales in the year-end shopping season, but the earnings fell below market expectations.Profit for the October-December quarter was 168.7 billion yen ($1.54 billion) versus 158.6 billion yen a year earlier.That compared with an average forecast of 175 billion yen from 10 analyst estimates compiled by Refinitiv.')
# Build a list with one (text, label) tuple per entity span found in the document.
ner = [(ent.text, ent.label_) for ent in doc.ents]
ner

[('Nintendo Co Ltd 7974.T', 'ORG'),
 ('Thursday third-quarter', 'DATE'),
 ('6%,driven', 'CARDINAL'),
 ('Switch', 'NORP'),
 ('year-end shopping season', 'DATE'),
 ('October-December quarter', 'DATE'),
 ('168.7 billion yen', 'MONEY'),
 ('$1.54 billion', 'MONEY'),
 ('158.6 billion yen', 'MONEY'),
 ('a year earlier', 'DATE'),
 ('175 billion yen', 'MONEY'),
 ('10', 'CARDINAL')]

In [25]:
from bs4 import BeautifulSoup
import requests
import re
def html_text(url, timeout=10):
    """Download a web page and return its text content as a single string.

    ``<script>``, ``<style>`` and ``<aside>`` elements are removed from the
    parsed document before the text is extracted, and every run of newline
    and tab characters in the result is replaced by one space.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 10 so an unresponsive host cannot hang the notebook.

    Returns
    -------
    str
        Text of the page with newline/tab runs collapsed to single spaces.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx or 5xx status code.
    requests.RequestException
        For connection failures and timeouts.
    """
    # requests waits indefinitely unless a timeout is given explicitly.
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of running NER on an error page.
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose contents are not article text.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()

    return re.sub(r"[\n\t]+", " ", soup.get_text())

In [26]:
# Fetch the CNBC article as plain text, then run the spaCy pipeline over it.
article_url = 'https://www.cnbc.com/2022/06/24/dan-yergin-on-oil-prices-falling-despite-tight-supply-russia-tensions.html'
ny_bb = html_text(article_url)
article = nlp(ny_bb)
# Number of entity spans found in the article.
len(article.ents)

120

In [21]:
# Tally how many entities in the article carry each label.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'CARDINAL': 9,
         'DATE': 17,
         'GPE': 22,
         'LOC': 3,
         'MONEY': 4,
         'NORP': 3,
         'ORG': 39,
         'PERCENT': 4,
         'PERSON': 15,
         'PRODUCT': 1,
         'QUANTITY': 1,
         'TIME': 1,
         'WORK_OF_ART': 1})

In [22]:
# The four entity strings that occur most often in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(4)

[('Russia', 6), ('OPEC+', 5), ('CNBC', 5), ('Ukraine', 4)]

In [24]:
# Materialise the sentence spans so a single one can be picked by index.
sentences = list(article.sents)
print(sentences[20])

Authorities in the region are now scrambling to fill underground storage with natural gas supplies.


Using spaCy's built-in displaCy visualizer

In [31]:
# Render the entities of the Nintendo example (`doc`) inline with displaCy's "ent" style.
displacy.render(doc, style="ent", jupyter=True)

In [32]:
# Re-parse sentence 20 as its own document and draw it with displaCy's "dep" style.
displacy.render(
    nlp(str(sentences[20])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)