# The purpose of this notebook is to demonstrate the value of modular pipelines and to show how to interact with them

In [1]:
import spacy
from spacy import displacy

# First we will set up a default pipeline -- notice that, apart from the model name, no extra arguments are passed when calling load().

In [2]:
# Load en_core_web_sm without a `disable` argument, so none of its pipeline components are switched off.
default_nlp = spacy.load("en_core_web_sm")

In [3]:
# Each entry of `.pipeline` prints as a (name, component) pair, one line per pipeline step.
for pipe in default_nlp.pipeline:
    print(pipe)

('tagger', <spacy.pipeline.pipes.Tagger object at 0x7f374e66a390>)
('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7f37170cc528>)
('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7f37170cc588>)


# Now set up a pipeline where some steps are not enabled

In [4]:
# Same model, but loaded with the parser and tagger disabled so those steps are left out of the pipeline.
simpler_nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])

In [5]:
# List the remaining steps to confirm which components survived the `disable` list.
for pipe in simpler_nlp.pipeline:
    print(pipe)

('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7f37126a21c8>)


# Which will be faster?  Let's experiment...

In [6]:
# Sample two-sentence text; the cells below repeat it to build the document sets used for timing.
example_text = "There is slight enlargement of the spleen. No history of a heart murmur."

In [7]:
MAX_DOCUMENTS = 10

# Build a toy corpus by repeating the same example text MAX_DOCUMENTS times.
document_set = [example_text] * MAX_DOCUMENTS

print('Size of our document set : {}'.format(len(document_set)))

Size of our document set : 10


In [8]:
%%time

# Run the default pipeline over every document; the returned Doc is discarded because only the timing matters here.
for text in document_set:
    default_nlp(text)

CPU times: user 181 ms, sys: 1.19 ms, total: 183 ms
Wall time: 181 ms


In [9]:
%%time

# Same loop with the reduced pipeline, so its timing can be compared with the default-pipeline cell above.
for text in document_set:
    simpler_nlp(text)

CPU times: user 88.8 ms, sys: 598 µs, total: 89.4 ms
Wall time: 87 ms


# Let's pause for a moment and repeat what we did above, but instead of running each pipeline over 10 documents, let's run them over 1000 documents.

In [10]:
MAX_DOCUMENTS = CHANGE_ME  # exercise placeholder: substitute the document count requested above (1000)

# Build the larger corpus by repeating the same example text MAX_DOCUMENTS times.
larger_document_set = [example_text] * MAX_DOCUMENTS

print('Size of our larger document set : {}'.format(len(larger_document_set)))

NameError: name 'CHANGE_ME' is not defined

In [None]:
%%time

# Time the default pipeline on the larger document set; results are discarded, only the elapsed time is of interest.
for text in larger_document_set:
    default_nlp(text)

In [None]:
%%time

# Time the reduced pipeline (parser and tagger disabled) on the larger document set,
# mirroring the 10-document experiment so the two timings can be compared.
for text in larger_document_set:
    simpler_nlp(text)

# Since the steps of a pipeline are modular, let's change the order

In [None]:
# Short input passed to the custom pipelines built in the cells below.
simple_text = u"This is a sentence."

In [None]:
def my_component(doc):
    """Print a short report about ``doc`` and hand it back unchanged.

    Intended to be registered as a pipeline step: it takes the document
    produced by the preceding steps, prints the token count and each token's
    ``pos_`` value, adds a note when the document has fewer than 10 tokens,
    and returns the same object so later steps can keep processing it.

    Args:
        doc: A sized iterable of tokens, each exposing a ``pos_`` attribute.

    Returns:
        The ``doc`` object that was passed in.
    """
    token_count = len(doc)
    print("After tokenization, this doc has {} tokens.".format(token_count))

    # pos_ reflects whatever the steps that ran before this one have set.
    pos_tags = [tok.pos_ for tok in doc]
    print("The part-of-speech tags are:", pos_tags)

    is_short = token_count < 10
    if is_short:
        print("This is a pretty short document.")

    return doc

In [None]:
# Load a separate pipeline object for this experiment.
custom_pipeline = spacy.load("en_core_web_sm")

# Register my_component under the name "print_info"; last=True asks for it to be placed at the end of the pipeline.
custom_pipeline.add_pipe(my_component, name="print_info", last=True)

# Show the component names to confirm where "print_info" ended up.
print(custom_pipeline.pipe_names)

# Running the pipeline calls my_component, which prints its report as a side effect.
doc = custom_pipeline(simple_text)

# What happens when we try to print part of speech tags as the first step in the pipeline?  Change the code below to run the "print_info" component as the first component instead of the last

In [None]:
# Load another separate pipeline object so "print_info" can be placed differently here.
print_first_pipeline = spacy.load("en_core_web_sm")

# Exercise placeholder: replace CHANGE_ME with the value that makes "print_info" the first component.
print_first_pipeline.add_pipe(my_component, name="print_info", first = CHANGE_ME)

# Show the component names to confirm where "print_info" ended up.
print(print_first_pipeline.pipe_names)

# Running the pipeline calls my_component, which prints its report as a side effect.
doc = print_first_pipeline(simple_text)

# Let's look at the components in this pipeline again.  Change the code below so that for each of the pipelines above (custom_pipeline and print_first_pipeline) we write out their steps and the order they are executed

In [None]:
# Exercise placeholder: replace CHANGE_ME with one of the pipelines built above (custom_pipeline or print_first_pipeline).
for pipe in CHANGE_ME.pipeline:
    print(pipe)
    
# Exercise placeholder: replace CHANGE_ME_TOO with the other pipeline (custom_pipeline or print_first_pipeline).
for pipe in CHANGE_ME_TOO.pipeline:
    print(pipe)

# How many steps do they each have?  What is the difference between them?