In [None]:
import spacy

In [None]:
# One-time setup: uncomment and run this line to download the small English
# model that the spacy.load('en_core_web_sm') call below refers to.
# !python3 -m spacy download en_core_web_sm

In [None]:
# Load the small English pipeline by name. This cell does not install it —
# presumably the model is already present in the environment (the download
# command in this notebook is commented out).
nlp = spacy.load('en_core_web_sm')

In [None]:
# Sample sentence mixing people, dates, a money amount and a place; it is run
# through the pipeline so its entities can be inspected in the cells below.
# NOTE(review): "56/12/56" is not a valid calendar date — presumably included
# on purpose to see how the model labels it; confirm.
sent = nlp('''Mark Zuckerberg will meet Aditya Joshi
on Monday 6th June 2024, 56/12/56 for $3 Trillion deal
at Mumbai tomorrow''')
sent

Mark Zuckerberg will meet Aditya Joshi
on Monday 6th June 2024, 56/12/56 for $3 Trillion deal 
at Mumbai tomorrow

In [None]:
# Entities the pipeline found in the sample sentence.
# NOTE(review): per the original author's observation, writing "joshi" with a
# lowercase "j" stops the name from being tagged as a proper noun, so it is no
# longer picked up as an entity (proper nouns are expected to be capitalised).
# This is not demonstrated in the notebook — verify against the model in use.
sent.ents

(Mark Zuckerberg,
 Aditya Joshi,
 Monday 6th June 2024,
 56/12/56,
 $3 Trillion,
 Mumbai,
 tomorrow)

In [None]:
# List every entity of the sample sentence next to the label assigned to it.
for ent in sent.ents:
    print(ent, ent.label_, sep=" --> ")

Mark Zuckerberg --> PERSON
Aditya Joshi --> PERSON
Monday 6th June 2024 --> DATE
56/12/56 --> CARDINAL
$3 Trillion --> MONEY
Mumbai --> GPE
tomorrow --> DATE


In [None]:
# Multi-sentence paragraph about Alaska used as a longer NER input.
# The text still carries a bracketed citation marker ("[6]") and the
# continuation lines keep their leading spaces, so both reach the pipeline.
raw_text = '''Alaska is the largest U.S. state by area, comprising more total area than the next three largest states of Texas, California and Montana combined, and is the seventh-largest subnational division in the world.
 It is the third-least populous and most sparsely populated U.S. state, but is, with a population of 736,081 as of 2020, the continent's most populous territory located mostly north of the 60th parallel, with more than quadruple the combined populations of Northern Canada and Greenland.[6]
  The state contains the second-largest and largest cities in the United States by area: the state capital of Juneau, and its former capital, Sitka, respectively.
  The state's most populous city is Anchorage and approximately half of Alaska's residents live within its metropolitan area. '''
raw_text

"Alaska is the largest U.S. state by area, comprising more total area than the next three largest states of Texas, California and Montana combined, and is the seventh-largest subnational division in the world.\n It is the third-least populous and most sparsely populated U.S. state, but is, with a population of 736,081 as of 2020, the continent's most populous territory located mostly north of the 60th parallel, with more than quadruple the combined populations of Northern Canada and Greenland.[6]\n  The state contains the second-largest and largest cities in the United States by area: the state capital of Juneau, and its former capital, Sitka, respectively. \n  The state's most populous city is Anchorage and approximately half of Alaska's residents live within its metropolitan area. "

In [None]:
# Run the pipeline over the whole Alaska paragraph. Despite the name, sent1
# holds the full multi-sentence text, not a single sentence.
sent1 = nlp(raw_text)

In [None]:
# List every entity found in the Alaska paragraph with its assigned label.
for ent in sent1.ents:
    print(ent.text, ent.label_, sep=" --> ")

Alaska --> GPE
U.S. --> GPE
three --> CARDINAL
Texas --> GPE
California --> GPE
Montana --> GPE
seventh --> ORDINAL
third --> ORDINAL
U.S. --> GPE
736,081 --> CARDINAL
2020 --> DATE
60th --> ORDINAL
Northern Canada --> ORG
second --> ORDINAL
the United States --> GPE
Juneau --> GPE
Sitka --> PERSON
Anchorage --> GPE
approximately half --> CARDINAL
Alaska --> GPE


In [None]:
# Look up the description of GPE, the label printed above for place names
# such as Mumbai, Alaska and Texas.
spacy.explain('GPE')

'Countries, cities, states'

In [None]:
# Look up the description of CARDINAL, the label printed above for plain
# numbers such as "three" and "736,081".
spacy.explain('CARDINAL')

'Numerals that do not fall under another type'

In [None]:
# Look up the description of ORDINAL, the label printed above for
# "seventh", "third" and "60th".
spacy.explain('ORDINAL')

'"first", "second", etc.'

In [None]:
# Look up the description of NORP. This label does not appear in the entity
# listings printed above; it is queried here for reference only.
spacy.explain('NORP')

'Nationalities or religious or political groups'

In [None]:
# Look up the description of PERSON. In the recorded output above this label
# was given to real names (Mark Zuckerberg, Aditya Joshi) but also to "Sitka",
# which the Alaska text describes as a former state capital.
spacy.explain('PERSON')

'People, including fictional'

In [None]:
# Look up the description of DATE, the label printed above for
# "Monday 6th June 2024", "tomorrow" and "2020".
spacy.explain('DATE')

'Absolute or relative dates or periods'

In [None]:
from spacy import displacy

# Draw the sample sentence in the notebook using displaCy's entity ("ent") style.
displacy.render(sent, style="ent", jupyter=True)

In [None]:
# NOTE(review): bare attribute reference — nothing is called here, so at most
# the cell echoes the object's repr. Looks like leftover exploration; confirm
# and remove if it is not needed.
displacy.app

In [None]:
# displacy is imported again here (an earlier cell already does so), which
# keeps this cell runnable on its own.
from spacy import displacy

# Draw the text currently bound to sent1 using displaCy's entity ("ent") style.
# At this point in top-to-bottom order sent1 is the Alaska paragraph.
displacy.render(sent1, style="ent", jupyter=True)

In [None]:
# Multi-sentence paragraph about the ICC chairman/president roles, used as a
# third NER input. The bracketed citation markers ([6]-[9]) are part of the
# text; in the recorded output below, "8" and "9" show up as CARDINAL entities.
icc_sent = '''The Chairman heads the board of directors and on June 26, 2014, Narayanaswami Srinivasan, the former president of BCCI, was announced as the first chairman of the council.[6]
The role of ICC president became a largely honorary position after the establishment of the chairman role and other changes made to the ICC constitution in 2014.
It has been claimed that the 2014 changes have handed control to the 'Big Three' nations of England, India and Australia.[7]
The last ICC president was Zaheer Abbas, [8] who was appointed in June 2015 following the resignation of Mustafa Kamal in April 2015.
When the post of ICC president was abolished in April 2016, Shashank Manohar, who replaced Srinivasan in October 2015, became the first independent elected chairman of the ICC. [9]
'''
icc_sent

"The Chairman heads the board of directors and on June 26, 2014, Narayanaswami Srinivasan, the former president of BCCI, was announced as the first chairman of the council.[6] \nThe role of ICC president became a largely honorary position after the establishment of the chairman role and other changes made to the ICC constitution in 2014.\nIt has been claimed that the 2014 changes have handed control to the 'Big Three' nations of England, India and Australia.[7] \nThe last ICC president was Zaheer Abbas, [8] who was appointed in June 2015 following the resignation of Mustafa Kamal in April 2015. \nWhen the post of ICC president was abolished in April 2016, Shashank Manohar, who replaced Srinivasan in October 2015, became the first independent elected chairman of the ICC. [9]\n"

In [None]:
# Run the pipeline over the ICC paragraph.
# NOTE: this rebinds sent1, which earlier held the processed Alaska paragraph.
# Any earlier cell that reads sent1 will show the ICC text if it is re-run
# after this one.
sent1 = nlp(icc_sent)

In [None]:
# List every entity found in the ICC paragraph with its assigned label.
for ent in sent1.ents:
    print(ent.text, ent.label_, sep=" --> ")

June 26, 2014 --> DATE
Narayanaswami Srinivasan --> PERSON
BCCI --> ORG
first --> ORDINAL
ICC --> ORG
ICC --> ORG
2014 --> DATE
2014 --> DATE
England --> GPE
India --> GPE
ICC --> ORG
Zaheer Abbas --> PERSON
8 --> CARDINAL
June 2015 --> DATE
Mustafa Kamal --> PERSON
April 2015 --> DATE
ICC --> ORG
April 2016 --> DATE
Shashank Manohar --> PERSON
Srinivasan --> ORG
October 2015 --> DATE
first --> ORDINAL
ICC --> ORG
9 --> CARDINAL


In [None]:
# Collect the text of every entity labelled PERSON in the ICC paragraph.
# Only the label is checked, so a name given another label is left out (the
# recorded output above shows the standalone "Srinivasan" labelled ORG).
icc_names = [
    entity.text
    for entity in sent1.ents
    if entity.label_ == "PERSON"
]
icc_names

['Narayanaswami Srinivasan',
 'Zaheer Abbas',
 'Mustafa Kamal',
 'Shashank Manohar']