# Named Entity Recognition

## Import SpaCy in English

In [1]:
!python -m spacy download en_core_web_lg
import spacy
nlp = spacy.load("en_core_web_lg")

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Let’s Try on Real Dataset1

In [2]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')  # Change 'html5lib' to 'html.parser'
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

ny_bb = url_to_string('https://www.reuters.com/markets/how-companies-are-responding-attacks-ships-red-sea-2023-12-19/')
article = nlp(ny_bb)
len(article.ents)

1

## Have a Look At The NERS

In [3]:
# Visualize named entities in the article using displacy
from spacy import displacy
displacy.render(article, style='ent', jupyter=True)

## Popular NER Types

In [4]:
# Count the frequency of each named entity label in the article
from collections import Counter
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'GPE': 1})

## Most Popular NER

In [10]:
# Extract the text of each named entity in the article and count the most common 5 entities
items = [x.text for x in article.ents]
Counter(items).most_common(5)

[('JS', 1)]

## Let’s Pick One Sentence to Analyze

In [11]:
# Extract sentences from the article and print the first sentence
sentences = [x for x in article.sents]
print(sentences[0])

reuters.comPlease enable JS and disable any ad blocker


## NER Tags

In [12]:
# Visualize named entities in the first sentence of the article
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

## Types of Words in Sentence

In [13]:
# Extract orthographic form, part-of-speech, and lemma of non-stopword, non-punctuation tokens in the first sentence
[(x.orth_,x.pos_, x.lemma_) for x in [y
                                      for y
                                      in nlp(str(sentences[0]))
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('reuters.comPlease', 'INTJ', 'reuters.complease'),
 ('enable', 'VERB', 'enable'),
 ('JS', 'PROPN', 'JS'),
 ('disable', 'VERB', 'disable'),
 ('ad', 'NOUN', 'ad'),
 ('blocker', 'NOUN', 'blocker')]

## Sentence Dependency Tree

In [14]:
# Visualize dependency parse of the first sentence of the article with custom distance between words
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True,
                options = {'distance': 120})

## Let’s Try on Real Dataset2

In [18]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')  # Change 'html5lib' to 'html.parser'
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

ny_bb = url_to_string('https://www.cnn.com/2024/03/26/politics/trump-hush-money-trial-gag-order/index.html')
article = nlp(ny_bb)
len(article.ents)

270

## Have a Look At The NERS

In [19]:
# Visualize named entities in the article using displacy
from spacy import displacy
displacy.render(article, style='ent', jupyter=True)

## Popular NER Types

In [20]:
# Count the frequency of each named entity label in the article
from collections import Counter
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'GPE': 27,
         'PERSON': 91,
         'ORG': 77,
         'CARDINAL': 8,
         'PRODUCT': 2,
         'ORDINAL': 2,
         'DATE': 21,
         'EVENT': 3,
         'WORK_OF_ART': 19,
         'LOC': 8,
         'FAC': 1,
         'TIME': 4,
         'NORP': 5,
         'MONEY': 1,
         'LAW': 1})

## Most Popular NER

In [21]:
# Extract the text of each named entity in the article and count the most common 5 entities
items = [x.text for x in article.ents]
Counter(items).most_common(5)

[('Trump', 43),
 ('CNN', 28),
 ('Merchan', 22),
 ('CNN Video Ad Feedback', 5),
 ('New York', 5)]

## Let’s Pick One Sentence to Analyze

In [22]:
# Extract sentences from the article and print the first sentence
sentences = [x for x in article.sents]
print(sentences[0])

  NY judge issues gag order on Trump in hush money trial | CNN Politics CNN values your feedback                                                         1.


## NER Tags

In [23]:
# Visualize named entities in the first sentence of the article
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

## Types of Words in Sentence

In [24]:
# Extract orthographic form, part-of-speech, and lemma of non-stopword, non-punctuation tokens in the first sentence
[(x.orth_,x.pos_, x.lemma_) for x in [y
                                      for y
                                      in nlp(str(sentences[0]))
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('  ', 'SPACE', '  '),
 ('NY', 'PROPN', 'NY'),
 ('judge', 'NOUN', 'judge'),
 ('issues', 'NOUN', 'issue'),
 ('gag', 'VERB', 'gag'),
 ('order', 'NOUN', 'order'),
 ('Trump', 'PROPN', 'Trump'),
 ('hush', 'ADJ', 'hush'),
 ('money', 'NOUN', 'money'),
 ('trial', 'NOUN', 'trial'),
 ('CNN', 'PROPN', 'CNN'),
 ('Politics', 'PROPN', 'Politics'),
 ('CNN', 'PROPN', 'CNN'),
 ('values', 'VERB', 'value'),
 ('feedback', 'NOUN', 'feedback'),
 ('                                                        ',
  'SPACE',
  '                                                        '),
 ('1', 'NUM', '1')]

## Sentence Dependency Tree

In [25]:
# Visualize dependency parse of the first sentence of the article with custom distance between words
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True,
                options = {'distance': 120})

## Importing SpaCy in Spanish Language

In [41]:
!python -m spacy download es_core_news_md



Collecting es-core-news-md==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.7.0/es_core_news_md-3.7.0-py3-none-any.whl (42.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: es-core-news-md
Successfully installed es-core-news-md-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [42]:
import spacy

# Load the Spanish NLP model
nlp = spacy.load("es_core_news_md")  # Or "es_core_news_sm" for the small model

# Example Spanish text
text = "Esta es una oración en español."

# Process the text with the model
doc = nlp(text)

# Tokenization
print([token.text for token in doc])

# Part-of-speech tagging and lemmatization
for token in doc:
    print(token.text, token.lemma_, token.pos_)

# Named entity recognition
for ent in doc.ents:
    print(ent.text, ent.label_)


['Esta', 'es', 'una', 'oración', 'en', 'español', '.']
Esta este PRON
es ser AUX
una uno DET
oración oración NOUN
en en ADP
español español NOUN
. . PUNCT


## Let’s Try on Real Dataset3

In [43]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')  # Change 'html5lib' to 'html.parser'
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

ny_bb = url_to_string('https://english.elpais.com/')
article = nlp(ny_bb)
len(article.ents)

173

## Have a Look At The NERS

In [44]:
# Visualize named entities in the article using displacy
from spacy import displacy
displacy.render(article, style='ent', jupyter=True)

## Popular NER Types

In [45]:
# Count the frequency of each named entity label in the article
from collections import Counter
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'LOC': 15, 'MISC': 85, 'PER': 47, 'ORG': 26})

## Most Popular NER

In [46]:
# Extract the text of each named entity in the article and count the most common 5 entities
items = [x.text for x in article.ents]
Counter(items).most_common(5)

[('English____Mar', 1),
 ('Seisdedos|WashingtonAt the oral arguments in the mifepristone case', 1),
 ('Christian doctors’ association to sue the Food and Drug', 1),
 ('York', 1),
 ('Donald Trump', 1)]

## Let’s Pick One Sentence to Analyze

In [47]:
# Extract sentences from the article and print the first sentence
sentences = [x for x in article.sents]
print(sentences[0])

EL PAÍS English____Mar 27, 2024|Updated 04:49 CET|Select:- - -EspañaAméricaMéxicoColombiaChileArgentinaUSAsubscribeHHOLALOG INInternationalU.S.Economy & BusinessScienceHealthTechnologyClimatePeopleLifestyleOpinionSportsMar 27, 2024|Updated 04:49 CET|subscribe_Supreme Court seems inclined not to restrict access to abortion pill Iker Seisdedos|WashingtonAt the oral arguments in the mifepristone case, a majority of justices appeared skeptical about the legal standing of a Christian doctors’ association to sue the Food and Drug AdministrationNew York judge imposes gag order on Donald Trump in hush money caseJudge Juan M. Merchan on Tuesday cited Trump’s previous comments about him and others involved in the case, as well as a looming April 15 trial date, in granting the prosecution’s request for a gag orderBaltimore’s largest bridge collapses after being struck by shipMiguel Jiménez|BaltimoreEmergency crews have rescued two people from the water and are searching for six others.


## NER Tags

In [48]:
# Visualize named entities in the first sentence of the article
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

## Types of Words in Sentence

In [49]:
# Extract orthographic form, part-of-speech, and lemma of non-stopword, non-punctuation tokens in the first sentence
[(x.orth_,x.pos_, x.lemma_) for x in [y
                                      for y
                                      in nlp(str(sentences[0]))
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('PAÍS', 'PROPN', 'PAÍS'),
 ('English____Mar', 'PROPN', 'English____Mar'),
 ('27', 'NUM', '27'),
 ('2024|Updated', 'DET', '2024|updated'),
 ('04:49', 'NUM', '04:49'),
 ('CET|Select:-', 'PROPN', 'CET|Select:-'),
 ('-EspañaAméricaMéxicoColombiaChileArgentinaUSAsubscribeHHOLALOG',
  'NUM',
  '-españaaméricaméxicocolombiachileargentinausasubscribehholalog'),
 ('INInternationalU.S.Economy', 'PROPN', 'INInternationalU.S.Economy'),
 ('&', 'PROPN', '&'),
 ('BusinessScienceHealthTechnologyClimatePeopleLifestyleOpinionSportsMar',
  'PROPN',
  'BusinessScienceHealthTechnologyClimatePeopleLifestyleOpinionSportsMar'),
 ('27', 'NUM', '27'),
 ('2024|Updated', 'DET', '2024|updated'),
 ('04:49', 'NUM', '04:49'),
 ('CET|subscribe_Supreme', 'NUM', 'cet|subscribe_supreme'),
 ('Court', 'PROPN', 'Court'),
 ('seems', 'PROPN', 'seems'),
 ('inclined', 'PROPN', 'inclined'),
 ('not', 'PROPN', 'not'),
 ('to', 'PROPN', 'to'),
 ('restrict', 'PROPN', 'restrict'),
 ('access', 'PROPN', 'access'),
 ('to', 'PROPN', 'to

## Sentence Dependency Tree

In [50]:
# Visualize dependency parse of the first sentence of the article with custom distance between words
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True,
                options = {'distance': 120})