# Named Entity Recognition
Named entity recognition extracts typed attributes from text, such as `DATE`, `PERSON`, and `LOCATION`.

In [2]:
from pprint import pprint

import spacy
from spacy import displacy

# Load the pre-trained `en_core_web_lg` model once at module level so every
# later call reuses the same pipeline object.
# If the model is not installed yet, run the following line once first:
# spacy.cli.download("en_core_web_lg")
NER = spacy.load("en_core_web_lg")

def spacy_large_ner(document):
    """Return the unique named entities found in ``document``.

    Runs the module-level ``NER`` pipeline over ``document`` and collects
    every entity as a ``(text, label)`` tuple, with surrounding whitespace
    stripped from the entity text. Because the result is a set, repeated
    mentions that yield the same text/label pair appear only once.
    """
    parsed = NER(document)
    found = set()
    for entity in parsed.ents:
        found.add((entity.text.strip(), entity.label_))
    return found


# Sample text: two paragraphs from the Wikipedia article on Narendra Modi.
# The literal begins and ends with a newline and still contains Wikipedia
# footnote markers such as "[b]" and "[c]".
text = """
Narendra Damodardas Modi (Gujarati: [ˈnəɾendɾə dɑmodəɾˈdɑs ˈmodiː] (listen); born 17 September 1950)[b] is an Indian politician serving as the 14th and current prime minister of India since May 2014. Modi was the chief minister of Gujarat from 2001 to 2014 and is the Member of Parliament from Varanasi. He is a member of the Bharatiya Janata Party (BJP) and of the Rashtriya Swayamsevak Sangh (RSS), a right-wing Hindu nationalist paramilitary volunteer organisation. He is the longest serving prime minister from outside the Indian National Congress.

Modi was born and raised in Vadnagar in northeastern Gujarat, where he completed his secondary education. He was introduced to the RSS at age eight. He has reminisced about helping out after school at his father's tea stall at the Vadnagar railway station. At age 18, he was married to Jashodaben Chimanlal Modi, whom he abandoned soon after. He first publicly acknowledged her as his wife more than four decades later when required to do so by Indian law, but has made no contact with her since. Modi has asserted he had travelled in northern India for two years after leaving his parental home, visiting a number of religious centres, but few details of his travels have emerged. Upon his return to Gujarat in 1971, he became a full-time worker for the RSS. After the state of emergency was declared by Prime Minister Indira Gandhi in 1975, he went into hiding. The RSS assigned him to the BJP in 1985 and he held several positions within the party hierarchy until 2001, rising to the rank of general secretary.[c]
"""

# Parse the sample once; the resulting Doc is reused for both the entity
# listing and the visualisation below.
doc = NER(text)

# Collect (entity text, entity label) pairs; a set removes duplicate pairs.
results = {
    (entity.text.strip(), entity.label_)
    for entity in doc.ents
}
pprint(results)

# Render the entities highlighted inline in the notebook output.
displacy.render(doc, jupyter=True, style="ent")

{('14th', 'ORDINAL'),
 ('17 September', 'DATE'),
 ('1971', 'DATE'),
 ('1975', 'DATE'),
 ('1985', 'DATE'),
 ('2001', 'DATE'),
 ('2001 to', 'DATE'),
 ('2014', 'DATE'),
 ('BJP', 'ORG'),
 ('Gujarat', 'GPE'),
 ('Gujarati', 'NORP'),
 ('Hindu', 'NORP'),
 ('India', 'GPE'),
 ('Indian', 'NORP'),
 ('Indira Gandhi', 'PERSON'),
 ('Jashodaben Chimanlal', 'PERSON'),
 ('May 2014', 'DATE'),
 ('Modi', 'PERSON'),
 ('Narendra Damodardas Modi', 'PERSON'),
 ('Parliament', 'ORG'),
 ('RSS', 'ORG'),
 ('Vadnagar', 'GPE'),
 ('Varanasi', 'GPE'),
 ('age 18', 'DATE'),
 ('age eight', 'DATE'),
 ('first', 'ORDINAL'),
 ('more than four decades later', 'DATE'),
 ('the Bharatiya Janata Party', 'ORG'),
 ('the Indian National Congress', 'ORG'),
 ('the Rashtriya Swayamsevak Sangh', 'ORG'),
 ('two years', 'DATE'),
 ('ˈnəɾendɾə dɑmodəɾˈdɑs ˈmodiː', 'PERSON')}
