In [1]:
import spacy

### Checking spaCy NER settings

In [59]:
# Load spaCy's small English pipeline
nlp = spacy.load('en_core_web_sm')

# Grab the named-entity recognizer component and list every label it can emit,
# one "<label><TAB><explanation>" row per label
ner = nlp.get_pipe('ner')

for label in ner.labels:
    print(label, spacy.explain(label), sep="\t")

CARDINAL	Numerals that do not fall under another type
DATE	Absolute or relative dates or periods
EVENT	Named hurricanes, battles, wars, sports events, etc.
FAC	Buildings, airports, highways, bridges, etc.
GPE	Countries, cities, states
LANGUAGE	Any named language
LAW	Named documents made into laws.
LOC	Non-GPE locations, mountain ranges, bodies of water
MONEY	Monetary values, including unit
NORP	Nationalities or religious or political groups
ORDINAL	"first", "second", etc.
ORG	Companies, agencies, institutions, etc.
PERCENT	Percentage, including "%"
PERSON	People, including fictional
PRODUCT	Objects, vehicles, foods, etc. (not services)
QUANTITY	Measurements, as of weight or distance
TIME	Times smaller than a day
WORK_OF_ART	Titles of books, songs, etc.


### Trying out spaCy NER with sample articles

In [65]:
# Paths (relative to this notebook) of the article text files to run through NER
sample_articles = ["../../data/processed/articles/test/texts/3.txt"]

In [66]:
# Entity labels kept for this experiment; a set gives constant-time membership tests
RELEVANT_LABELS = {"ORG", "PERSON", "GPE", "PRODUCT", "EVENT", "WORK_OF_ART"}

# Initialise the accumulator *before* the loop so that
#   * the name always exists, even when no article yields any entity
#     (previously this raised NameError or reused stale kernel state), and
#   * entities from every article are kept instead of only the last article's.
# `distinct_entities` stays a list of (entity text, entity label) tuples in
# first-seen order; `seen_entities` mirrors it for fast duplicate checks.
distinct_entities = []
seen_entities = set()

for article in sample_articles:
    with open(article, "rt", encoding="utf-8") as fp:
        # Line breaks are flattened to spaces so the article is parsed as one text.
        # NOTE(review): text separated only by newlines (e.g. headings) ends up
        # glued to its neighbours, which may explain entities containing double
        # spaces in the output - confirm whether that is acceptable.
        text = fp.read().replace("\n", " ")
    doc = nlp(text)
    # Iterating an empty `doc.ents` is a no-op, so no emptiness guard is needed
    for entity in doc.ents:
        named_entity = (entity.text, entity.label_)
        if entity.label_ in RELEVANT_LABELS and named_entity not in seen_entities:
            seen_entities.add(named_entity)
            distinct_entities.append(named_entity)

# One "<text><TAB><label><TAB><label explanation>" row per distinct entity
for entity_text, entity_label in distinct_entities:
    print("\t".join([entity_text, entity_label, str(spacy.explain(entity_label))]))

NIO	ORG	Companies, agencies, institutions, etc.
China	GPE	Countries, cities, states
NIO House  Thesis  NIO Inc.	ORG	Companies, agencies, institutions, etc.
the Chinese Communist Party	ORG	Companies, agencies, institutions, etc.
Tesla	PERSON	People, including fictional
BYD	ORG	Companies, agencies, institutions, etc.
OTCPK	ORG	Companies, agencies, institutions, etc.
BYDDY	ORG	Companies, agencies, institutions, etc.
XPeng	GPE	Countries, cities, states
Li Auto	PERSON	People, including fictional
Tesla	ORG	Companies, agencies, institutions, etc.
XPeng	ORG	Companies, agencies, institutions, etc.
Li	PERSON	People, including fictional
the Wuling Hong Guang	ORG	Companies, agencies, institutions, etc.
SAIC-GM-Wuling	ORG	Companies, agencies, institutions, etc.
ICE	ORG	Companies, agencies, institutions, etc.
the NIO ES6	ORG	Companies, agencies, institutions, etc.
UK	GPE	Countries, cities, states
V3	PRODUCT	Objects, vehicles, foods, etc. (not services)
NEDC	ORG	Companies, agencies, institutions, etc

In [67]:
# Show the collected (entity text, entity label) tuples as the cell's output
distinct_entities

[('NIO', 'ORG'),
 ('China', 'GPE'),
 ('NIO House  Thesis  NIO Inc.', 'ORG'),
 ('the Chinese Communist Party', 'ORG'),
 ('Tesla', 'PERSON'),
 ('BYD', 'ORG'),
 ('OTCPK', 'ORG'),
 ('BYDDY', 'ORG'),
 ('XPeng', 'GPE'),
 ('Li Auto', 'PERSON'),
 ('Tesla', 'ORG'),
 ('XPeng', 'ORG'),
 ('Li', 'PERSON'),
 ('the Wuling Hong Guang', 'ORG'),
 ('SAIC-GM-Wuling', 'ORG'),
 ('ICE', 'ORG'),
 ('the NIO ES6', 'ORG'),
 ('UK', 'GPE'),
 ('V3', 'PRODUCT'),
 ('NEDC', 'ORG'),
 ('NIO ES8', 'ORG'),
 ('Shanghai', 'GPE'),
 ('GXC', 'ORG'),
 ('Rio Tinto', 'ORG'),
 ('RIO', 'ORG'),
 ('Columbia University', 'ORG'),
 ('US', 'GPE'),
 ('Conclusion  NIO', 'ORG'),
 ('EV', 'PRODUCT'),
 ('CCP', 'ORG')]