In [1]:
import spacy

# Sample sentences containing monetary amounts for the entity recognizer.
texts = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]

nlp = spacy.load("en_core_web_sm")

# Only the entities are inspected below, so these components are switched
# off for this nlp.pipe() call.
disabled_components = ["tagger", "parser"]

for doc in nlp.pipe(texts, disable=disabled_components):
    # Collect one (text, label) pair per entity and print them per document.
    entity_pairs = []
    for ent in doc.ents:
        entity_pairs.append((ent.text, ent.label_))
    print(entity_pairs)

[('$9.4 million', 'MONEY'), ('the prior year', 'DATE'), ('$2.7 million', 'MONEY')]
[('twelve billion dollars', 'MONEY'), ('1b', 'MONEY')]


In [2]:
import spacy

def my_component(doc):
    """Print a short report about ``doc`` and hand it back unchanged.

    The report contains the number of tokens, the list of each token's
    ``pos_`` value, and an extra remark when the document has fewer than
    10 tokens.

    Args:
        doc: a sized iterable of tokens, each exposing a ``pos_`` attribute.

    Returns:
        The very same ``doc`` object that was passed in.
    """
    token_count = len(doc)
    print("After tokenization, this doc has {} tokens.".format(token_count))

    pos_tags = []
    for tok in doc:
        pos_tags.append(tok.pos_)
    print("The part-of-speech tags are:", pos_tags)

    is_short = token_count < 10
    if is_short:
        print("This is a pretty short document.")
    return doc

# Load the en_core_web_sm pipeline for this cell.
nlp = spacy.load("en_core_web_sm")
# Register my_component under the name "print_info" as the last component.
nlp.add_pipe(my_component, name="print_info", last=True)
print(nlp.pipe_names)  # ['tagger', 'parser', 'ner', 'print_info']
# Processing a text runs the pipeline, so my_component prints its report here.
doc = nlp("This is a sentence.")

['tagger', 'parser', 'ner', 'print_info']
After tokenization, this doc has 5 tokens.
The part-of-speech tags are: ['DET', 'AUX', 'DET', 'NOUN', 'PUNCT']
This is a pretty short document.


In [3]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

class EntityMatcher(object):
    """Pipeline component that turns exact phrase matches into entities.

    Each term is tokenized with ``nlp.make_doc`` and registered in a
    ``PhraseMatcher`` under a single label. When the component is called on
    a document, every match that does not collide with an entity already in
    ``doc.ents`` (or with a longer match) is added to ``doc.ents``.
    """

    # Default name of the component in the pipeline (see nlp.pipe_names).
    name = "entity_matcher"

    def __init__(self, nlp, terms, label):
        """Build the phrase matcher.

        Args:
            nlp: the language object; provides the tokenizer (``make_doc``)
                and the shared vocab.
            terms: iterable of phrases (strings) to match.
            label: label assigned to every matched span.
        """
        patterns = [nlp.make_doc(text) for text in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        # Second positional argument is the on-match callback; none is used.
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        """Add non-overlapping phrase matches to ``doc.ents``.

        Assigning overlapping spans to ``doc.ents`` raises a ValueError, so
        matches that share a token with an existing entity, or with a match
        that was already accepted, are skipped. Longer matches are considered
        first; ties go to the match that starts earlier.

        Args:
            doc: the document to annotate; modified in place.

        Returns:
            The same ``doc`` object.
        """
        entities = list(doc.ents)

        # Token indices that already belong to some entity.
        occupied = set()
        for ent in entities:
            occupied.update(range(ent.start, ent.end))

        # Sort key: negative length (longest first), then start offset.
        ordered = sorted(self.matcher(doc), key=lambda m: (m[1] - m[2], m[1]))

        added = False
        for match_id, start, end in ordered:
            token_ids = range(start, end)
            if any(i in occupied for i in token_ids):
                continue
            entities.append(Span(doc, start, end, label=match_id))
            occupied.update(token_ids)
            added = True

        # Write doc.ents once, and only when something new was found, so a
        # document without matches is left exactly as it arrived.
        if added:
            doc.ents = entities
        return doc

# Load the en_core_web_sm pipeline and build a matcher for the animal terms.
nlp = spacy.load("en_core_web_sm")
terms = ("cat", "dog", "tree kangaroo", "giant sea spider")
entity_matcher = EntityMatcher(nlp, terms, "ANIMAL")

# Place the matcher directly after "ner": EntityMatcher.__call__ extends the
# entities that are already present in doc.ents.
nlp.add_pipe(entity_matcher, after="ner")

print(nlp.pipe_names)  # The components in the pipeline

# Print every entity of the processed text as a (text, label) pair.
doc = nlp("This is a text about Barack Obama and a tree kangaroo")
print([(ent.text, ent.label_) for ent in doc.ents])

['tagger', 'parser', 'ner', 'entity_matcher']
[('Barack Obama', 'LOC'), ('tree kangaroo', 'ANIMAL')]


In [4]:
# Custom sentence segmentation: start a new sentence only at a title-cased token that follows "|"
import spacy

def custom_sentencizer(doc):
    """Preset sentence boundaries: only "|" + title-cased token starts one.

    For every token after the first, ``is_sent_start`` is set explicitly:
    ``True`` when the preceding token's text is "|" and the token itself is
    title-cased, ``False`` otherwise. The first token is never modified.

    Args:
        doc: the document to annotate; its tokens are modified in place.

    Returns:
        The same ``doc`` object.
    """
    # Start at index 1 and run through the final token. Stopping earlier
    # would leave the last token undecided and would miss a document that
    # ends in "| Title".
    for i in range(1, len(doc)):
        token = doc[i]
        # Define sentence start if pipe + titlecase token. Otherwise
        # explicitly set it to False, to tell the parser to leave this
        # token alone.
        token.is_sent_start = bool(doc[i - 1].text == "|" and token.is_title)
    return doc

# Load the en_core_web_sm pipeline for this cell.
nlp = spacy.load("en_core_web_sm")
# custom_sentencizer presets is_sent_start on tokens, so it has to run
# before the parser.
nlp.add_pipe(custom_sentencizer, before="parser")  # Insert before the parser
doc = nlp("This is. A sentence. | This is. Another sentence.")
# Print each resulting sentence on its own line.
for sent in doc.sents:
    print(sent.text)

This is. A sentence. |
This is. Another sentence.


In [0]:
from spacy.tokens import Doc, Span, Token

# Words that count as fruit for the custom attributes below.
fruits = ["apple", "pear", "banana", "orange", "strawberry"]


def is_fruit_getter(token):
    """Return True when the token's text is one of ``fruits``."""
    return token.text in fruits


def has_fruit_getter(obj):
    """Return True when any token in ``obj`` has a text listed in ``fruits``."""
    return any(t.text in fruits for t in obj)


# force=True keeps this cell re-runnable in the same kernel: without it,
# registering an extension name that already exists raises an error.
Token.set_extension("is_fruit", getter=is_fruit_getter, force=True)
Doc.set_extension("has_fruit", getter=has_fruit_getter, force=True)
Span.set_extension("has_fruit", getter=has_fruit_getter, force=True)

In [0]:
# Relies on `nlp` created in an earlier cell and on the extensions
# registered via set_extension.
# NOTE(review): the indices assume one token per whitespace-separated word,
# i.e. doc[3] is "apple" and doc[1:4] is "have an apple" - confirm.
doc = nlp("I have an apple and a melon")
assert doc[3]._.is_fruit   # get Token attributes
assert not doc[0]._.is_fruit
assert doc._.has_fruit        # get Doc attributes
assert doc[1:4]._.has_fruit   # get Span attributes