In [1]:
import spacy

In [2]:
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.2/12.8 MB 7.6 MB/s eta 0:00:02
     - -------------------------------------- 0.4/12.8 MB 4.6 MB/s eta 0:00:03
     - -------------------------------------- 0.6/12.8 MB 4.3 MB/s eta 0:00:03
     -- ------------------------------------- 0.8/12.8 MB 4.1 MB/s eta 0:00:03
     --- ------------------------------------ 1.0/12.8 MB 4.3 MB/s eta 0:00:03
     --- ------------------------------------ 1.3/12.8 MB 4.2 MB/s eta 0:00:03
     ---- ----------------------------------- 1.5/12.8 MB 4.3 MB/s eta 0:00:03
     ----- ---------------------------------- 1.7/12.8 MB 4.4 MB/s eta 0:00:03
     ----- ---------------------------------- 1.9/12.8 MB 4.3 MB/s eta 0:00:03
     ------ -----------------------------

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
sent = nlp('''Mark Zukerber will meet Aditya Joshi
    on Monday 6th June 2024, 10am for $3 Trilion deal
    at Mumbai''')

In [5]:
sent.ents

(Mark Zukerber, Aditya Joshi, Monday 6th June 2024, 10am, $3 Trilion, Mumbai)

In [6]:
for ent in sent.ents:
    print(ent.text,'-->',ent.label_)

Mark Zukerber --> PERSON
Aditya Joshi --> PERSON
Monday 6th June 2024 --> DATE
10am --> TIME
$3 Trilion --> MONEY
Mumbai --> GPE


In [7]:
raw_text = "Alaska is the largest U.S. state by area, comprising more total area than the next three largest states of Texas, California and Montana combined, and is the seventh-largest subnational division in the world. It is the third-least populous and most sparsely populated U.S. state, but is, with a population of 736,081 as of 2020, the continent's most populous territory located mostly north of the 60th parallel, with more than quadruple the combined populations of Northern Canada and Greenland.[6] The state contains the second-largest and largest cities in the United States by area: the state capital of Juneau, and its former capital, Sitka, respectively. The state's most populous city is Anchorage and approximately half of Alaska's residents live within its metropolitan area."
text = nlp(raw_text)

In [8]:
text

Alaska is the largest U.S. state by area, comprising more total area than the next three largest states of Texas, California and Montana combined, and is the seventh-largest subnational division in the world. It is the third-least populous and most sparsely populated U.S. state, but is, with a population of 736,081 as of 2020, the continent's most populous territory located mostly north of the 60th parallel, with more than quadruple the combined populations of Northern Canada and Greenland.[6] The state contains the second-largest and largest cities in the United States by area: the state capital of Juneau, and its former capital, Sitka, respectively. The state's most populous city is Anchorage and approximately half of Alaska's residents live within its metropolitan area.

In [9]:
for ent in text.ents:
    print(ent.text,'-->',ent.label_)   

Alaska --> GPE
U.S. --> GPE
three --> CARDINAL
Texas --> GPE
California --> GPE
Montana --> GPE
seventh --> ORDINAL
third --> ORDINAL
U.S. --> GPE
736,081 --> CARDINAL
2020 --> DATE
60th --> ORDINAL
Northern Canada --> ORG
second --> ORDINAL
the United States --> GPE
Juneau --> GPE
Sitka --> PERSON
Anchorage --> GPE
approximately half --> CARDINAL
Alaska --> GPE


In [10]:
spacy.explain('GPE')

'Countries, cities, states'

In [11]:
spacy.explain('CARDINAL')

'Numerals that do not fall under another type'

In [12]:
spacy.explain('ORDINAL')

'"first", "second", etc.'

In [13]:
spacy.explain('NORP')

'Nationalities or religious or political groups'

Explain the NER in interactive way

In [14]:
from spacy import displacy

displacy.render(text, style= 'ent', jupyter=True)

Example

In [15]:
icc_text = "The Chairman heads the board of directors and on June 26, 2014, Narayanaswami Srinivasan, the former president of BCCI, was announced as the first chairman of the council.[6] The role of ICC president became a largely honorary position after the establishment of the chairman role and other changes made to the ICC constitution in 2014. It has been claimed that the 2014 changes have handed control to the 'Big Three' nations of England, India and Australia.[7] The last ICC president was Zaheer Abbas,[8] who was appointed in June 2015 following the resignation of Mustafa Kamal in April 2015. When the post of ICC president was abolished in April 2016, Shashank Manohar, who replaced Srinivasan in October 2015, became the first independent elected chairman of the ICC."
icc_text

"The Chairman heads the board of directors and on June 26, 2014, Narayanaswami Srinivasan, the former president of BCCI, was announced as the first chairman of the council.[6] The role of ICC president became a largely honorary position after the establishment of the chairman role and other changes made to the ICC constitution in 2014. It has been claimed that the 2014 changes have handed control to the 'Big Three' nations of England, India and Australia.[7] The last ICC president was Zaheer Abbas,[8] who was appointed in June 2015 following the resignation of Mustafa Kamal in April 2015. When the post of ICC president was abolished in April 2016, Shashank Manohar, who replaced Srinivasan in October 2015, became the first independent elected chairman of the ICC."

In [16]:
text = nlp(icc_text)

for ent in text.ents:
    print(ent.text,'-->',ent.label_)  


June 26, 2014 --> DATE
Narayanaswami Srinivasan --> PERSON
BCCI --> ORG
first --> ORDINAL
ICC --> ORG
ICC --> ORG
2014 --> DATE
2014 --> DATE
England --> GPE
India --> GPE
ICC --> ORG
Zaheer Abbas,[8 --> PERSON
June 2015 --> DATE
Mustafa Kamal --> PERSON
April 2015 --> DATE
ICC --> ORG
April 2016 --> DATE
Shashank Manohar --> PERSON
Srinivasan --> ORG
October 2015 --> DATE
first --> ORDINAL
ICC --> ORG


In [17]:
displacy.render(text, style= 'ent', jupyter=True)

In [18]:
count = 0
for ent in text.ents:
    if ent.label_ == 'PERSON':
        print(ent.text)
        count += 1
print(count)

Narayanaswami Srinivasan
Zaheer Abbas,[8
Mustafa Kamal
Shashank Manohar
4
