# Named Entity Recognition

## spaCy

In [1]:
!pip install spacy

Defaulting to user installation because normal site-packages is not writeable


## Download the spaCy English model

In [2]:
# Download through spaCy's Python API so the model is installed for the
# interpreter that runs this kernel, not whichever `python` is first on PATH.
import spacy.cli

spacy.cli.download("en_core_web_lg")

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     -------------------------------------- 0.1/587.7 MB 787.7 kB/s eta 0:12:26
     ---------------------------------------- 0.4/587.7 MB 3.0 MB/s eta 0:03:14
     ---------------------------------------- 1.0/587.7 MB 5.5 MB/s eta 0:01:48
     ---------------------------------------- 1.6/587.7 MB 7.2 MB/s eta 0:01:22
     ---------------------------------------- 2.5/587.7 MB 9.9 MB/s eta 0:01:00
     --------------------------------------- 3.6/587.7 MB 12.1 MB/s eta 0:00:49
     --------------------------------------- 4.6/587.7 MB 14.1 MB/s eta 0:00:42
     ----------------------------------

In [6]:
import spacy
# Load the `en_core_web_lg` English pipeline; `nlp` is reused by the cells below.
nlp = spacy.load("en_core_web_lg")


In [7]:
# Tokenize a sample sentence and show the token boundaries, each token
# followed by a pipe character.
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
print("".join(str(token) + "|" for token in doc), end="")

My|best|friend|Ryan|Peters|likes|fancy|adventure|games|.|

In [8]:
# Run the sentence through the pipeline and print every token's text,
# each one terminated by a pipe.
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for tok in doc:
    print(tok.text, end="|")

My|best|friend|Ryan|Peters|likes|fancy|adventure|games|.|

## Which attributes does spaCy add to each token?

In [9]:
import pandas as pd

def display_nlp(doc, include_punct=False):
    """Build a data frame with one row per spaCy token for inspection.

    Parameters
    ----------
    doc : iterable of tokens
        Tokens exposing ``text``, ``lemma_``, ``is_stop``, ``is_alpha``,
        ``pos_``, ``dep_``, ``ent_type_``, ``ent_iob_`` and ``is_punct``.
    include_punct : bool, default False
        When False, tokens whose ``is_punct`` is true are left out.

    Returns
    -------
    pandas.DataFrame
        One row per kept token, indexed by the token's position in ``doc``
        (the index is unnamed). When no token is kept, an empty frame with
        the same columns is returned.
    """
    columns = ['token', 'text', 'lemma_', 'is_stop', 'is_alpha',
               'pos_', 'dep_', 'ent_type_', 'ent_iob_']
    rows = [
        {'token': i, 'text': t.text, 'lemma_': t.lemma_,
         'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
         'pos_': t.pos_, 'dep_': t.dep_,
         'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
        for i, t in enumerate(doc)
        if include_punct or not t.is_punct
    ]

    # Naming the columns explicitly keeps set_index('token') from raising a
    # KeyError when `rows` is empty (empty doc, or punctuation only).
    df = pd.DataFrame(rows, columns=columns).set_index('token')
    df.index.name = None
    return df
# Show the token table for the current `doc`; punctuation is left out by default.
display_nlp(doc)


Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,Ryan,Ryan,False,True,PROPN,compound,PERSON,B
4,Peters,Peters,False,True,PROPN,appos,PERSON,I
5,likes,like,False,True,VERB,ROOT,,O
6,fancy,fancy,False,True,ADJ,amod,,O
7,adventure,adventure,False,True,NOUN,compound,,O
8,games,game,False,True,NOUN,dobj,,O


## Removing stop words using spaCy

In [10]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)

# Keep only the tokens that are neither stop words nor punctuation.
non_stop = [tok for tok in doc if not (tok.is_stop or tok.is_punct)]
print(non_stop)


[Dear, Ryan, need, sit, talk, Regards, Pete]


## Finding all nouns using spaCy

In [42]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

# Select tokens tagged as common nouns (NOUN) or proper nouns (PROPN).
nouns = [tok for tok in doc if tok.pos_ == 'NOUN' or tok.pos_ == 'PROPN']
print(nouns)


[friend, Ryan, Peters, adventure, games]


## Named Entity Recognition

In [11]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

# Format every detected entity first, then print them all on one line.
labelled = [f"({ent.text}, {ent.label_})" for ent in doc.ents]
for item in labelled:
    print(item, end=" ")


(Ryan Peters, PERSON) 

## A harder example

In [12]:
# A sentence mixing a person, an organisation and a place name.
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco."
doc = nlp(text)

for ent in doc.ents:
    summary = f"({ent.text}, {ent.label_})"
    print(summary, end=" ")


(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

## Visualize the named entities

In [13]:
from spacy import displacy

# Highlight the entities of the current `doc`; jupyter=True renders the
# result in the notebook output.
displacy.render(doc, style='ent', jupyter=True)


In [15]:
!pip install html5lib

Defaulting to user installation because normal site-packages is not writeable
Collecting html5lib
  Obtaining dependency information for html5lib from https://files.pythonhosted.org/packages/6c/dd/a834df6482147d48e225a49515aabc28974ad5a4ca3215c18a882565b028/html5lib-1.1-py2.py3-none-any.whl.metadata
  Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
   ---------------------------------------- 0.0/112.2 kB ? eta -:--:--
   --- ------------------------------------ 10.2/112.2 kB ? eta -:--:--
   ---------------------------------------- 112.2/112.2 kB 1.6 MB/s eta 0:00:00
Installing collected packages: html5lib
Successfully installed html5lib-1.1


## Let’s try a real dataset

In [19]:
from bs4 import BeautifulSoup
import requests
import re
import spacy

def url_to_string(url, timeout=10):
    """Download a web page and return its visible text as one string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, default 10
        Seconds to wait for the server before giving up; without a timeout
        the request could block indefinitely.

    Returns
    -------
    str
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by a single space.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a 4xx/5xx HTTP status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    # Drop elements whose content is not part of the readable page text.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Example URL - replace it with any URL you want to scrape
ny_bb = url_to_string('https://blackboard.sacredheart.edu/ultra/institution-page')

# Load spaCy's small English pipeline.
# Make sure to download the model first using "python -m spacy download en_core_web_sm"
# NOTE(review): this rebinds `nlp` from "en_core_web_lg" to "en_core_web_sm", so
# every cell executed afterwards uses this model — confirm that is intended.
nlp = spacy.load("en_core_web_sm")

# Process the extracted text
article = nlp(ny_bb)

# Output the number of named entities found
print(len(article.ents))

# Optionally, print out each entity
for ent in article.ents:
    print(ent.text, ent.label_)


16
JavaScript PRODUCT
Blackboard PERSON
Multi-Factor Authentication ORG
1 CARDINAL
Google Authenticator ORG
Authy PERSON
2 Open TIME
3 CARDINAL
Cancel Next Multi-Factor Authentication PERSON
6 CARDINAL
Back Submit Multi-Factor Authentication PERSON
Done Sign PERSON
third ORDINAL
MySHU Portal PERSON
1997-2024 DATE
All Rights Reserved WORK_OF_ART


## Have a look at the named entities

In [20]:
displacy.render(article, style='ent', jupyter=True)


## Most common entity types

In [26]:
from collections import Counter

# Tally how often each entity label occurs in the article.
labels = [entity.label_ for entity in article.ents]
label_counts = Counter(labels)
label_counts


Counter({'PERSON': 6,
         'CARDINAL': 3,
         'ORG': 2,
         'PRODUCT': 1,
         'TIME': 1,
         'ORDINAL': 1,
         'DATE': 1,
         'WORK_OF_ART': 1})

## Most common entities

In [27]:
# Count the surface text of every entity and keep the five most frequent.
items = [entity.text for entity in article.ents]
entity_counts = Counter(items)
entity_counts.most_common(5)


[('JavaScript', 1),
 ('Blackboard', 1),
 ('Multi-Factor Authentication', 1),
 ('1', 1),
 ('Google Authenticator', 1)]

## Let’s pick one sentence to analyze

In [36]:
# Materialise the sentence spans so they can be picked by position.
sentences = list(article.sents)
print(sentences[1])


Username Password Forgot Password?


## NER tags

In [43]:
# Parse the sentence text as its own document, then highlight its entities.
sentence_doc = nlp(str(sentences[1]))
displacy.render(sentence_doc, style='ent', jupyter=True)


## Types of words in the sentence

In [39]:
# (surface form, coarse POS tag, lemma) for every token of the sentence that
# is neither a stop word nor punctuation.
[
    (tok.orth_, tok.pos_, tok.lemma_)
    for tok in nlp(str(sentences[1]))
    if not (tok.is_stop or tok.pos_ == 'PUNCT')
]


[('Username', 'PROPN', 'Username'),
 ('Password', 'PROPN', 'Password'),
 ('Forgot', 'PROPN', 'Forgot'),
 ('Password', 'PROPN', 'Password')]

## Sentence dependency tree

In [41]:
# Parse one sentence on its own and draw its dependency tree.
# NOTE(review): this renders sentences[2] while the cells above inspect
# sentences[1] — confirm which sentence is intended.
dep_doc = nlp(str(sentences[2]))
displacy.render(dep_doc, style='dep', jupyter=True, options={'distance': 120})
