In [2]:
!pip install spacy



In [1]:
# python -m spacy download en_core_web_sm
import spacy
# spacy.cli.download("en_core_web_sm")

In [2]:
# en_core_web_sm is a small English pipeline trained on written web text
# (blogs, news, comments) that includes vocabulary, syntax and entities.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # spacy.load raises OSError when the model package is not installed. The
    # download commands above are commented out, so fetch the model once here
    # and load it again; this keeps "Restart & Run All" working on a fresh setup.
    spacy.cli.download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

In [3]:
#Write a function to display basic entity info:
def show_ents(doc):
    """Print one line per named entity found in ``doc``.

    Each line has the form
    ``text - start_char - end_char - label - explanation``, where the
    explanation is whatever ``spacy.explain`` returns for the label.
    If ``doc.ents`` is empty, a single "not found" message is printed instead.
    """
    if not doc.ents:
        print('No named entities found.')
        return

    for ent in doc.ents:
        fields = (
            ent.text,
            ent.start_char,
            ent.end_char,
            ent.label_,
            spacy.explain(ent.label_),
        )
        print(' - '.join(str(field) for field in fields))

In [4]:
# Run the pipeline on a sample sentence and list the entities it finds.
doc1= nlp("Apple is looking at buying U.K. startup for $1 billion")

show_ents(doc1)

Apple - 0 - 5 - ORG - Companies, agencies, institutions, etc.
U.K. - 27 - 31 - GPE - Countries, cities, states
$1 billion - 44 - 54 - MONEY - Monetary values, including unit


In [5]:
# "May" occurs twice in this sentence; in the recorded output only "next May"
# is tagged as a DATE entity.
doc2 = nlp('May I go to Washington, DC next May to see the Washington Monument?')
show_ents(doc2)

Washington, DC - 12 - 26 - GPE - Countries, cities, states
next May - 27 - 35 - DATE - Absolute or relative dates or periods
the Washington Monument - 43 - 66 - ORG - Companies, agencies, institutions, etc.


In [6]:
doc3 = nlp('Can I please borrow 500 dollars from you to buy some Microsoft stock?')

# Print each entity's text, its label, and the description of that label.
for ent in doc3.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_))

500 dollars MONEY Monetary values, including unit
Microsoft ORG Companies, agencies, institutions, etc.


In [7]:
doc4 = nlp("San Francisco considers banning sidewalk delivery robots")

# Document level: walk over the entities in doc4.ents and print, for each one,
# its text, character offsets and label.
for e in doc4.ents:
    print(e.text, e.start_char, e.end_char, e.label_)
# Alternatively, collect the same four fields into a list of tuples.
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc4.ents] #in a list comprehension form
print(ents)

San Francisco 0 13 GPE
[('San Francisco', 0, 13, 'GPE')]


In [8]:
# Token level: indexing the Doc (doc4[0], doc4[1], ...) yields individual tokens,
# and each token carries its own entity annotation:
#   ent_iob_  - 'B' for the first token of an entity, 'I' for a following token
#               of the same entity, 'O' for a token outside any entity
#   ent_type_ - the entity label, or '' for tokens outside any entity

ent_san = [ doc4[0].text,  doc4[0].ent_iob_,  doc4[0].ent_type_ ]
ent_francisco = [ doc4[1].text,  doc4[1].ent_iob_,  doc4[1].ent_type_ ]

print(ent_san)
print(ent_francisco)
print("_________________")
# Iterate over the tokens directly so the loop always covers the whole
# sentence, whatever its length.
for token in doc4:
    res = [token.text, token.ent_iob_, token.ent_type_]
    print(res)


['San', 'B', 'GPE']
['Francisco', 'I', 'GPE']
_________________
['San', 'B', 'GPE']
['Francisco', 'I', 'GPE']
['considers', 'O', '']
['banning', 'O', '']
['sidewalk', 'O', '']
['delivery', 'O', '']
['robots', 'O', '']


### User Defined Named Entity and Adding it to a Span 

###### Example 1

In [9]:
# In the recorded output the model tags "U.K." and "$6 million" but not "Tesla".
doc = nlp('Tesla to build a U.K. factory for $6 million')
show_ents(doc)

U.K. - 17 - 21 - GPE - Countries, cities, states
$6 million - 34 - 44 - MONEY - Monetary values, including unit


In [10]:
from spacy.tokens import Span

In [11]:
# Look up the hash value that the vocabulary's string store holds for the ORG label.
ORG = doc.vocab.strings['ORG']

# Wrap token 0 ("Tesla") in a Span that carries that label.
new_ent = Span(doc, 0, 1, label=ORG)

# Re-assign doc.ents with the existing entities followed by the new Span.
doc.ents = [*doc.ents, new_ent]

In [12]:
# List the entities again: the manually added ORG span should now be included.
show_ents(doc)

Tesla - 0 - 5 - ORG - Companies, agencies, institutions, etc.
U.K. - 17 - 21 - GPE - Countries, cities, states
$6 million - 34 - 44 - MONEY - Monetary values, including unit


###### Example 2

In [13]:
# Collect the entities the model predicts for this sentence, before any manual additions.
doc = nlp("is hiring a new vice president of fb global policy")
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print('Before', ents)

Before []


In [14]:
# The model found no entities in this sentence, so label one by hand.
# Span(doc, 7, 10) covers tokens 7, 8 and 9 (the end index is exclusive), i.e. the
# whole phrase "fb global policy" - not just "fb" - is tagged as ORG.
fb_ent = Span(doc, 7, 10, label="ORG")
doc.ents = [*doc.ents, fb_ent]

# show_ents(doc) would print the same information, one entity per line.
ents = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
print('After', ents)

After [('fb global policy', 34, 50, 'ORG')]


### Visualizing NER

In [15]:
# Import the displaCy library
from spacy import displacy

In [16]:
# Process a sentence and render its entities inline in the notebook using
# displaCy's "ent" style.
text = "When S. Thrun started working on self driving cars at Google in 2007 few people outside of the company took him serious"
doc = nlp(text)
displacy.render(doc, style="ent", jupyter=True)

In [19]:
# A longer, multi-sentence passage mentioning dates, people, places, times and brands,
# rendered with the same "ent" style.
text="""On July 18, 2023, John Smith and Sarah Johnson 
attended the grand opening of Blue Sky Mall, located in the bustling city of New York. 
The event took place at 6:00 PM in the mall's central courtyard, which was beautifully adorned with 
colorful decorations. Prominent brands such as Apple, Nike, and Coca-Cola showcased their latest products, 
attracting a large crowd of enthusiastic shoppers. The event was organized by XYZ Events, a renowned event management 
company known for its exceptional attention to detail and flawless execution."""
doc = nlp(text)
displacy.render(doc, style="ent", jupyter=True)

In [20]:
# A multi-paragraph news-style text. The resulting `doc` is reused by the
# sentence-by-sentence and styling cells that follow.
text = """Clearview AI, a New York-headquartered facial recognition company, has been fined £7.5 million ($9.4 million) by a U.K. privacy regulator.

Over the last few years, the firm has collected images from the web and social media of people in Britain and elsewhere to create a global online database that can be used by law enforcement for facial recognition.

The Information Commission’s Office said Monday that the company has breached U.K. data protection laws.

The ICO has ordered Clearview to delete data it has on U.K. residents and banned it from collecting any more.

Clearview writes on its website that it has collected more than 20 billion facial images of people around the world. It collects publicly posted images from social media platforms like Facebook and Instagram, as well as news media, mugshot websites and other open sources. It does so without informing the individuals or asking for their consent.

Clearview’s platform allows law enforcement agencies to upload a photo of an individual and try to match it to photos that are stored in Clearview’s database.

John Edwards, the U.K.’s information commissioner, said in a statement: “The company not only enables identification of those people, but effectively monitors their behavior and offers it as a commercial service. That is unacceptable.”

He added that people expect their personal information to be respected, regardless of where in the world their data is being used."""

doc = nlp(text)

displacy.render(doc, style='ent', jupyter=True)

### Visualizing Sentences Line by Line

In [21]:
# Render each sentence of the already-processed doc separately. displaCy accepts a
# Span directly, so the sentence text is not pushed through the pipeline a second
# time and the entities shown are the ones already stored on `doc`.
for sent in doc.sents:
    displacy.render(sent, style='ent', jupyter=True)



### Styling: customize color and effects

In [22]:
# Restrict the visualisation to the entity labels listed under the 'ents' option.
options = {'ents': ['ORG', 'PRODUCT']}

displacy.render(doc, style='ent', jupyter=True, options=options)

In [23]:
# Map entity labels to CSS colour values (gradients here) for the highlights.
colors = {'ORG': 'linear-gradient(90deg, #f2c707, #dc9ce7)', 'PRODUCT': 'radial-gradient(white, green)'}

# Combine the label filter with the custom colours.
options = {'ents': ['ORG', 'PRODUCT'], 'colors':colors}

displacy.render(doc, style='ent', jupyter=True, options=options)

In [27]:
# Same label filter as the cells above, with a different colour scheme.
# displaCy reads the label filter from the 'ents' key; any other key is ignored,
# which would leave every entity type visible.
colors = {'ORG':'linear-gradient(90deg,#aa9cde,#dc9ce7)','PRODUCT':'radial-gradient(white,red)'}
options = {'ents':['ORG','PRODUCT'],'colors':colors}
displacy.render(doc,style='ent',jupyter=True,options=options)

### Adding Named Entities to All Matching Spans

In [28]:
# Two sentences that both mention "vacuum cleaner"; in the recorded output the
# model only tags "first" (ORDINAL).
doc = nlp(
    'Our company plans to introduce a new vacuum cleaner. '
    'If successful, the vacuum cleaner will be our first product.'
)

show_ents(doc)

first - 99 - 104 - ORDINAL - "first", "second", etc.


In [42]:
# Display the pipeline's vocabulary object (the same object handed to the
# PhraseMatcher in the following cell).
nlp.vocab

<spacy.vocab.Vocab at 0x1ecda1bfc10>

In [46]:
# Import PhraseMatcher and create a matcher object:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

# Create the desired phrase patterns. Matching on the token text only needs
# tokenisation, so build the patterns with nlp.make_doc instead of running the
# full pipeline on every phrase.
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp.make_doc(phrase) for phrase in phrase_list]

# Register the patterns under a single key. spaCy v3 takes the patterns as a list
# in the second argument; the add(key, None, *patterns) form is the legacy v2 call.
matcher.add('newproduct', phrase_patterns)

# Apply the matcher to our Doc object:
matches = matcher(doc)

# See what matches occur: each entry is (match_id, start_token, end_token).
matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [52]:
# Scratch check: run the pipeline on a short string and display the resulting Doc.
nlp('ta')

ta

In [47]:
# Turn every matcher hit into a PRODUCT entity and attach the new entities to the doc.
from spacy.tokens import Span

PROD = doc.vocab.strings['PRODUCT']

# Each match is a (match_id, start, end) triple: start is the index of the first
# matched token in doc and end is the (exclusive) stop index.
new_ents = [Span(doc, start, end, label=PROD) for _match_id, start, end in matches]

doc.ents = [*doc.ents, *new_ents]

In [53]:
# NOTE(review): the recorded output below, (29.50, five dollars), matches the price
# sentence assigned to `doc` in a later cell, so this cell appears to have been
# executed out of order; a top-to-bottom run would show the vacuum-cleaner entities.
doc.ents

(29.50, five dollars)

In [48]:
# List the entities again: both "vacuum cleaner" spans should now be labelled PRODUCT.
show_ents(doc)

vacuum cleaner - 37 - 51 - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum cleaner - 72 - 86 - PRODUCT - Objects, vehicles, foods, etc. (not services)
first - 99 - 104 - ORDINAL - "first", "second", etc.


In [49]:
# A sentence with two monetary amounts, one numeric and one spelled out.
doc = nlp('Originally priced at $29.50, the sweater was marked down to five dollars.')

show_ents(doc)

29.50 - 22 - 27 - MONEY - Monetary values, including unit
five dollars - 60 - 72 - MONEY - Monetary values, including unit


In [50]:
# Count how many of the doc's entities carry the MONEY label.
sum(1 for ent in doc.ents if ent.label_ == 'MONEY')

2