In [1]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Span
from spacy import displacy

In [2]:
# Load the `en_core_web_sm` pipeline; every doc below is produced by this `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Sample text that spells the publication three ways: "SpaceNews",
# "Space-News" and "Space News". Adjacent literals are joined by the parser,
# so each piece keeps its own trailing space.
spacenews_doc = nlp(
    "SpaceNews is a print and digital publication that covers business "
    "and political news in the space and satellite industry. Space-News provides news, "
    "commentary and analysis to an audience of government officials, politicians and "
    "executives within the space industry. Space News details topics in civil, military "
    "and commercial space and the satellite communications business."
)

In [4]:
# Print each sentence of the doc with a 1-based running number.
for number, sentence in enumerate(spacenews_doc.sents, start=1):
    print(f"{number}: {sentence}")

1: SpaceNews is a print and digital publication that covers business and political news in the space and satellite industry.
2: Space-News provides news, commentary and analysis to an audience of government officials, politicians and executives within the space industry.
3: Space News details topics in civil, military and commercial space and the satellite communications business.


### Building a Matcher Object

In [5]:
# Token-pattern matcher built on the pipeline's vocabulary; rules are added below.
matcher = Matcher(nlp.vocab)

Create a list of token patterns, one for each spelling of the target name: "SpaceNews", "Space-News", and "Space News".

In [6]:
# One token pattern per spelling; LOWER compares the lowercased token text,
# so the match is case-insensitive.
patterns = [
    # a single token: "spacenews"
    [{"LOWER": "spacenews"}],
    # three tokens: "space", any punctuation token, "news"
    [{"LOWER": "space"}, {"IS_PUNCT": True}, {"LOWER": "news"}],
    # two tokens: "space" followed by "news"
    [{"LOWER": "space"}, {"LOWER": "news"}],
]

In [7]:
# Register all three patterns under the single rule name "SpaceNews".
matcher.add("SpaceNews", patterns)

In [8]:
# Run the matcher over the doc and keep its results for the cells below.
found_matches = matcher(spacenews_doc)

The `found_matches` variable contains one tuple for every match found within the doc object. Each tuple holds a match id (the hash of the rule name), the index of the first matched token, and an exclusive end index, so `doc[start:end]` is the matched span.

In [9]:
# Show the raw (match_id, start, end) tuples.
found_matches

[(10501091333728194545, 0, 1),
 (10501091333728194545, 20, 23),
 (10501091333728194545, 44, 46)]

In [10]:
# For every match, show the matched text and the rule name recovered from
# the match id via the vocabulary's string store.
for match_id, start, end in found_matches:
    matched_span = spacenews_doc[start:end]
    m_id = nlp.vocab.strings[match_id]
    print(f"Original word: {matched_span}\nMatch id: {m_id}\n")

Original word: SpaceNews
Match id: SpaceNews

Original word: Space-News
Match id: SpaceNews

Original word: Space News
Match id: SpaceNews



### Adding a Named Entity to a Span

In [11]:
# Single-sentence example used to show how an entity can be added by hand.
tesla_doc = nlp(
    "Tesla delivered 185,000 cars in the first quarter, twice as many as a year ago."
)

In [12]:
# Rebind `matcher` to a new, empty Matcher; the rules added to the previous one are not carried over.
matcher = Matcher(nlp.vocab)

In [13]:
# A single one-token pattern: any token whose lowercased text is "tesla".
patterns = [[{"LOWER": "tesla"}]]

In [14]:
# Register the pattern under the rule name "Tesla".
matcher.add("Tesla", patterns)

In [15]:
# Run the Tesla matcher over the doc; this rebinds `found_matches` from the earlier section.
found_matches = matcher(tesla_doc)

In [16]:
# Keep only the token boundaries of the first match; the match id is not used
# below. The tuple is sliced rather than unpacked into `_` because binding `_`
# at notebook top level shadows IPython's "last output" variable.
start, end = found_matches[0][1:]

In [17]:
# Show the token boundaries of the matched span.
start, end

(0, 1)

Listing the entities spaCy recognized shows that Tesla is not among them. We can add it manually by building a `Span` over the matched tokens and appending it to the doc's entities.

In [18]:
# One row per entity: text padded to 20 columns, label padded to 10,
# then spaCy's description of the label.
for ent in tesla_doc.ents:
    description = spacy.explain(ent.label_)
    print(f"{ent.text:20} {ent.label_:10} {description}")

185,000              CARDINAL   Numerals that do not fall under another type
the first quarter    DATE       Absolute or relative dates or periods
as many as           CARDINAL   Numerals that do not fall under another type
a year ago           DATE       Absolute or relative dates or periods


In [19]:
# Look up the id the vocabulary's string store holds for the label "ORG".
ORG = tesla_doc.vocab.strings["ORG"]

In [20]:
# Show the id that stands for "ORG".
ORG

383

In [21]:
# Build a span over the matched tokens of tesla_doc (start and end come from the matcher) and label it with the ORG id.
new_entity = Span(tesla_doc, start, end, label=ORG)

In [22]:
# Show the text covered by the new span.
new_entity

Tesla

In [23]:
# Entities on the doc before the manual addition.
list(tesla_doc.ents)

[185,000, the first quarter, as many as, a year ago]

In [24]:
# Append the hand-built span to the doc's entities. Any existing entity that
# shares tokens with it is dropped first, so the cell can be re-run safely:
# re-adding the same span would otherwise hand spaCy two overlapping entities,
# which the `doc.ents` setter rejects with a ValueError.
tesla_doc.ents = [
    ent
    for ent in tesla_doc.ents
    if ent.end <= new_entity.start or ent.start >= new_entity.end
] + [new_entity]

In [25]:
# Entities on the doc after the manual addition.
tesla_doc.ents

(Tesla, 185,000, the first quarter, as many as, a year ago)

In [26]:
# Render the doc's entities inline in the notebook with displaCy's 'ent' style.
displacy.render(tesla_doc, style='ent', jupyter=True)

We see that by manually adding in "Tesla" as an entity, displacy is able to interpret it as an ORG entity.

### Adding Named Entities to All Matching Spans

In [39]:
# Longer example in which "Tesla" occurs twice. Adjacent literals are joined
# by the parser, so each piece keeps its own trailing space.
tesla_doc = nlp(
    "Tesla delivered 185,000 cars in the first quarter, twice as "
    "many as a year ago. Tesla makes cars that are fully electric - the factory "
    "version of the Model 3 car is around $37,000."
)

In [40]:
# Entities the pipeline assigned to the new doc.
tesla_doc.ents

(185,000,
 the first quarter,
 as many as,
 a year ago,
 Tesla,
 Model 3,
 around $37,000)

In [41]:
# Render the new doc's entities inline in the notebook with displaCy's 'ent' style.
displacy.render(tesla_doc, style='ent', jupyter=True)