# NAMED ENTITY RECOGNITION USING SPACY

In [1]:
import spacy

In [2]:
# spaCy provides separate models for different languages; "en_core_web_sm"
# is the English model loaded here.
# spacy.load() returns a Language object, conventionally bound to the name
# `nlp`. Calling nlp(text) runs the pipeline and yields a Doc.
nlp = spacy.load("en_core_web_sm")

# NOTE: the string below is a bare expression, so the notebook echoes it as
# this cell's output. It talks about looping over tokens; the cells visible in
# this notebook iterate over entities (doc.ents) instead.
"""
we load the English language model (en_core_web_sm) and process a text using nlp().
The resulting Doc object contains a sequence of tokens, which can be accessed using a loop. 
and each token is printed individually.
"""

'\nwe load the English language model (en_core_web_sm) and process a text using nlp().\nThe resulting Doc object contains a sequence of tokens, which can be accessed using a loop. \nand each token is printed individually.\n'

In [3]:
# Show the names of the components that make up the loaded pipeline.
# Left as a bare expression so the notebook displays the resulting list;
# the 'ner' entry is the component that produces doc.ents.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [4]:
# Run the pipeline over a sample sentence to obtain an annotated Doc.
doc = nlp("Tesla Inc is going to acquire Twitter Inc for $45 billion")

# Named entities are pieces of text that fall into predefined classes such as
# person names, organizations, locations, dates, and more.
#
# doc.ents exposes the entities recognised in `doc` as a tuple of Span
# objects, one Span per entity. Useful Span attributes:
#   text   - the entity's text
#   start  - index of the entity's first token in the document
#   end    - token index at which the entity ends
#   label_ - the entity's label (category)

# One line per entity: text | label | human-readable description of the label.
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep=" | ")


Tesla Inc | PERSON | People, including fictional
Twitter Inc | ORG | Companies, agencies, institutions, etc.
$45 billion | MONEY | Monetary values, including unit


In [5]:
# The same entities can also be shown visually with displaCy, spaCy's
# visualisation module.

from spacy import displacy

# style="ent" selects the named-entity view of `doc`.
displacy.render(doc, style="ent")

In [6]:
# List every entity label that the pipeline's 'ner' component can assign
# (this is the label set, not the entities found in a particular Doc).

nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [7]:
# Same sentence without the "Inc" suffixes. The model's predictions change:
# here "Tesla" comes out as ORG and "Twitter" as PRODUCT.
doc2 = nlp("Tesla is going to acquire Twitter for $45 billion")

# One line per entity: text | label.
for ent in doc2.ents:
    print(ent.text, ent.label_, sep=" | ")

Tesla | ORG
Twitter | PRODUCT
$45 billion | MONEY


In [8]:
# Span is a spaCy class representing a span (slice) of tokens in a Doc.
# Tokens are the individual units a text is divided into. Tokenization splits
# a text into these units, which are typically words but can also be
# punctuation marks, numbers, or other meaningful components.

# Goal: correct the model's output for doc2, where "Twitter" was labelled
# PRODUCT but should be ORG.

from spacy.tokens import Span

# Build the Span from the Doc it will be attached to (doc2, not doc).
# Token indices in doc2: Tesla(0) is(1) going(2) to(3) acquire(4) Twitter(5),
# so the slice 5..6 selects exactly "Twitter". In `doc` the same indices would
# select "acquire", because of the extra "Inc" token after "Tesla".
s1 = Span(doc2, 5, 6, label="ORG")

# default="unmodified" keeps the existing entity annotations on all other
# tokens and only overrides the tokens covered by s1.
doc2.set_ents([s1], default="unmodified")


In [9]:
# Print doc2's entities again (text | label) to check the manual correction:
# "Twitter" should now be reported as ORG.
for ent in doc2.ents:
    print(ent.text, ent.label_, sep=" | ")

Tesla | ORG
Twitter | ORG
$45 billion | MONEY
