# Processing pipelines

In [17]:
import spacy
from spacy.language import Language
import pprint

# Start from a blank English ("en") pipeline with no components added yet
nlp = spacy.blank("en")

In [2]:
# Show the names of the components currently in the pipeline
print(nlp.pipe_names)

[]


In [12]:
# Rebind nlp to the pipeline loaded from the "en_core_web_sm" package
nlp = spacy.load("en_core_web_sm")

In [13]:
# List the component names of the loaded pipeline
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [15]:
# Pretty-print the pipeline as (name, component object) tuples
pprint.pprint(nlp.pipeline)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7fedeb88edc0>),
 ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7fedeb88ee20>),
 ('parser',
  <spacy.pipeline.dep_parser.DependencyParser object at 0x7fee2f745660>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler object at 0x7fedeb460780>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7fedec4016c0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7fee2f7459e0>)]


### Adding a custom component

In [18]:
# Create the nlp object from the "en_core_web_sm" package
nlp = spacy.load("en_core_web_sm")

# Define a custom component
@Language.component("custom_component")
def custom_component_function(doc):
    """Print the length of ``doc`` and return the same object unchanged.

    The doc must be returned so that it can be handed on to whatever runs
    after this component.
    """
    doc_length = len(doc)
    print("Doc length:", doc_length)
    return doc

# Add the component first in the pipeline; "custom_component" is the name
# registered via @Language.component
nlp.add_pipe("custom_component", first=True)

# Print the pipeline component names to confirm where it was inserted
print("Pipeline:", nlp.pipe_names)

Pipeline: ['custom_component', 'tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [19]:
# Process a text; the custom component prints the doc length while the pipeline runs
doc = nlp("Hello world!")

Doc length: 3


### Setting custom attributes

In [23]:
# Load the "en_core_web_sm" pipeline used by the extension examples in this section
nlp = spacy.load("en_core_web_sm")

In [20]:
# Import global classes
from spacy.tokens import Doc, Token, Span

# Set extensions on the Doc, Token and Span.
# force=True overwrites an existing registration, so re-running this cell does
# not fail because the extension is already defined.
Doc.set_extension("title", default=None, force=True)
Token.set_extension("is_color", default=False, force=True)
Span.set_extension("has_color", default=False, force=True)

**Extension attribute types**

1. Attribute extensions
2. Property extensions
3. Method extensions

#### Attribute extensions

Attribute extensions set a default value that can be overwritten.

In [24]:
from spacy.tokens import Token

# Set extension on the Token with default value
# (force=True: "is_color" has already been registered on Token earlier in the notebook)
Token.set_extension("is_color", default=False, force=True)

doc = nlp("The sky is blue.")

# Overwrite extension attribute value for this one token (doc[3] is "blue")
doc[3]._.is_color = True

#### Property extensions

Property extensions work like properties in Python: they can define a getter function and an optional setter.

The getter function is only called when you retrieve the attribute. This lets you compute the value dynamically, and even take other custom attributes into account.

In [26]:
from spacy.tokens import Token

# Define getter function
def get_is_color(token):
    """Return True when the token's exact text is one of the known color words.

    The comparison is case-sensitive: only "red", "yellow" and "blue" match.
    """
    known_colors = ("red", "yellow", "blue")
    is_color = token.text in known_colors
    return is_color

# Set extension on the Token with getter; force=True replaces the earlier
# "is_color" registration
Token.set_extension("is_color", getter=get_is_color, force=True)

doc = nlp("The sky is blue.")
# The value is computed by get_is_color when the attribute is read
print(doc[3]._.is_color, "-", doc[3].text)

True - blue


In [28]:
### SPAN EXTENSION
from spacy.tokens import Span

# Define getter function
def get_has_color(span):
    """Return True if at least one token in ``span`` has a color word as its text.

    Matching is case-sensitive against "red", "yellow" and "blue"; an empty
    span yields False.
    """
    palette = ("red", "yellow", "blue")
    for tok in span:
        if tok.text in palette:
            return True
    return False

# Set extension on the Span with getter; force=True replaces the earlier
# "has_color" registration
Span.set_extension("has_color", getter=get_has_color, force=True)

doc = nlp("The sky is blue.")
# doc[1:4] contains "blue"; doc[0:2] does not
print(doc[1:4]._.has_color, "-", doc[1:4].text)
print(doc[0:2]._.has_color, "-", doc[0:2].text)

True - sky is blue
False - The sky


#### Method extensions

Method extensions make the extension attribute a callable method.

You can then pass one or more arguments to it, and compute attribute values dynamically – for example, based on a certain argument or setting.

In [29]:
from spacy.tokens import Doc

# Define method with arguments
def has_token(doc, token_text):
    """Return True if any token in ``doc`` has text exactly equal to ``token_text``."""
    return any(tok.text == token_text for tok in doc)

# Set extension on the Doc with method.
# force=True keeps this cell re-runnable, matching the other set_extension
# calls in this section.
Doc.set_extension("has_token", method=has_token, force=True)

doc = nlp("The sky is blue.")
# The doc is passed to has_token automatically; only token_text is supplied here
print(doc._.has_token("blue"), "- blue")
print(doc._.has_token("cloud"), "- cloud")

True - blue
False - cloud


## Scaling and performance

If you need to process a lot of texts and create a lot of `Doc` objects in a row, the `nlp.pipe` method can speed this up significantly.

It processes the texts as a stream and yields `Doc` objects.

It is much faster than just calling `nlp` on each text, because it batches up the texts.

`nlp.pipe` is a generator that yields `Doc` objects, so to get a list of docs, remember to wrap the call in `list()`.

**BAD:**

In [None]:
# Anti-pattern: runs nlp on one text at a time, so nothing is batched
docs = [nlp(text) for text in LOTS_OF_TEXTS]

**GOOD:**

In [None]:
# nlp.pipe processes the texts as a stream; list() collects the yielded Doc objects
docs = list(nlp.pipe(LOTS_OF_TEXTS))

### Passing in context

1. Setting `as_tuples=True` on `nlp.pipe` lets you pass in `(text, context)` tuples
2. Yields `(doc, context)` tuples
3. Useful for associating metadata with the doc

In [30]:
nlp = spacy.blank("en")

# Each item is a (text, context) tuple; the context dict carries metadata for the text
data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]

# With as_tuples=True, nlp.pipe yields (doc, context) tuples
for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context["page_number"])

This is a text 15
And another text 16


The context can also be stored on each `Doc` as custom extension attributes.

In [31]:
from spacy.tokens import Doc

# Register the extension attributes that will hold the context values.
# force=True overwrites an existing registration, so re-running this cell
# does not fail because "id" / "page_number" are already defined.
Doc.set_extension("id", default=None, force=True)
Doc.set_extension("page_number", default=None, force=True)

data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]

# Copy each context value onto its doc as a custom attribute
for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context["id"]
    doc._.page_number = context["page_number"]

### Using only the tokenizer

If you only need a tokenized `Doc` object (but not other attributes), you can use the `nlp.make_doc` method instead, which takes a text and returns a doc.

In [32]:
# Only tokenize the text into a Doc via nlp.make_doc
doc = nlp.make_doc("Hello world!")

### Disabling pipeline components

In [36]:
nlp = spacy.load("en_core_web_sm")

text = "This is a text"

# Disable tagger and parser for the code inside the with block
with nlp.select_pipes(disable=["tagger", "parser"]):
    # Process the text and print the entities ("ner" is not in the disable list)
    doc = nlp(text)
    print(doc.ents)

()




In [None]:
import json
import spacy

nlp = spacy.load("en_core_web_sm")

# Read the texts from the JSON file; json.load parses directly from the file object
with open("exercises/en/tweets.json", encoding="utf8") as f:
    TEXTS = json.load(f)

# Process the texts as a batch with nlp.pipe and print the adjectives of each doc
processed_texts = list(nlp.pipe(TEXTS))
for doc in processed_texts:
    print([token.text for token in doc if token.pos_ == "ADJ"])

In [None]:
import json
import spacy

nlp = spacy.load("en_core_web_sm")

# Read the texts from the JSON file; json.load parses directly from the file object
with open("exercises/en/tweets.json", encoding="utf8") as f:
    TEXTS = json.load(f)

# Process the texts and print the entities.
# docs is a generator; it is consumed once by the comprehension below.
docs = nlp.pipe(TEXTS)
entities = [doc.ents for doc in docs]
print(*entities)

In [None]:
import spacy

nlp = spacy.blank("en")

people = ["David Bowie", "Angela Merkel", "Lady Gaga"]

# Create a list of patterns for the PhraseMatcher.
# nlp.pipe returns a generator, so wrap it in list() to get a reusable list of Docs.
patterns = list(nlp.pipe(people))