In [1]:
# Import spaCy
import spacy

# Introduction

### The nlp object

- contains the processing pipeline
- includes language-specific rules for tokenization etc.

In [2]:
# Create a blank English nlp object: spacy.blank is called with the language
# code "en" and the result is bound to `nlp` for use in the cells below
nlp = spacy.blank("en")

### The Doc object

Created by processing a string of text with the nlp object.

In [3]:
# Calling the nlp object on a string returns a Doc
doc = nlp("Hello world!")

# A Doc is iterable token by token: gather each token's text,
# then print the texts one per line
token_texts = [token.text for token in doc]
print("\n".join(token_texts))

Hello
world
!


### The Token object

In [4]:
# Integer indexing on a Doc returns the Token at that position
# (index 1 is the second token)
token = doc[1]

# The token's text is available through its .text attribute
token_text = token.text
print(token_text)

world


### The Span object

A slice from the Doc is a Span object.

In [5]:
# Slicing a Doc yields a Span; doc[1:3] covers the tokens at index 1 and 2
# (the end index is exclusive)
span = doc[1:3]

# Like a Token, a Span exposes its text through the .text attribute
span_text = span.text
print(span_text)

world!


In [6]:
# The "=" specifier inside the f-string prints the expression text ("span=")
# followed by the repr of its value
print(f"{span=}")

span=world!


### Lexical attributes

In [7]:
doc = nlp("It costs $5 (five dollars).")

# Each row pairs a printed label with the token attribute it reports.
# The labels are padded to the same width so the printed lists line up.
attribute_rows = [
    ("Index:   ", "i"),
    ("Text:    ", "text"),
    ("is_alpha:", "is_alpha"),
    ("is_punct:", "is_punct"),
    ("like_num:", "like_num"),
]

# Print one line per attribute, listing its value for every token in the doc
for label, attribute in attribute_rows:
    print(label, [getattr(token, attribute) for token in doc])

Index:    [0, 1, 2, 3, 4, 5, 6, 7, 8]
Text:     ['It', 'costs', '$', '5', '(', 'five', 'dollars', ')', '.']
is_alpha: [True, True, False, False, False, True, True, False, False]
is_punct: [False, False, False, False, True, False, False, True, True]
like_num: [False, False, False, True, False, True, False, False, False]


# Trained pipelines

In [None]:
# One-time setup: uncomment and run the line below to download the
# "en_core_web_sm" English pipeline package that the following cells load
# !python -m spacy download en_core_web_sm

What does a trained pipeline package contain?

- Binary weights
- Vocabulary
- Meta information
- Configuration file

In [15]:
import spacy

# Load the "en_core_web_sm" pipeline by name; this rebinds `nlp`, replacing
# the blank English object created earlier in the notebook
nlp = spacy.load("en_core_web_sm")

### Predicting Part-of-speech Tags

In [21]:
# Run the loaded pipeline on a sentence
doc = nlp("She ate the pizza.")

# For every token, show its text, the predicted part-of-speech tag,
# and spaCy's short explanation of that tag
for token in doc:
    pos_tag = token.pos_
    print(token.text, pos_tag, spacy.explain(pos_tag))

She PRON pronoun
ate VERB verb
the DET determiner
pizza NOUN noun
. PUNCT punctuation


### Predicting Syntactic Dependencies

In [23]:
# For every token, show: text, part-of-speech tag, dependency label,
# the text of the token's syntactic head, and an explanation of the label
for token in doc:
    dep_label = token.dep_
    head_text = token.head.text
    print(token.text, token.pos_, dep_label, head_text, spacy.explain(dep_label))

She PRON nsubj ate nominal subject
ate VERB ROOT ate root
the DET det pizza determiner
pizza NOUN dobj ate direct object
. PUNCT punct ate punctuation


### Predicting Named Entities

In [24]:
# Run the pipeline on a sentence that mentions several entities
doc = nlp("Apple is looking at buying U.K. startup for $1 billion.")

# doc.ents holds the predicted entities; show each one's text, its label,
# and spaCy's explanation of the label
for entity in doc.ents:
    entity_label = entity.label_
    print(entity.text, entity_label, spacy.explain(entity_label))

Apple ORG Companies, agencies, institutions, etc.
U.K. GPE Countries, cities, states
$1 billion MONEY Monetary values, including unit


## Matchers

In [25]:
import spacy

# Import the Matcher
from spacy.matcher import Matcher

# Load the trained English pipeline and bind it to nlp
nlp = spacy.load("en_core_web_sm")

# Two-token pattern: a token with the exact text "iPhone" followed by a
# token with the exact text "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

# The matcher is built on the pipeline's shared vocab; the pattern is
# registered under the name "IPHONE_PATTERN"
matcher = Matcher(nlp.vocab)
matcher.add("IPHONE_PATTERN", [pattern])

# Process the text, then run the matcher over the resulting doc
doc = nlp("Upcoming iPhone X release date leaked")
matches = matcher(doc)

In [27]:
# Each match unpacks into (match_id, start, end); slicing the doc with
# start and end gives the matched span, whose text is printed
for match_id, start, end in matches:
    print(doc[start:end].text)

iPhone X


### Matching lexical attributes

In [30]:
# Text to search
doc = nlp("2018 FIFA World Cup: France won!")

# Five-token pattern: a digit token, then three tokens whose lowercase forms
# are "fifa", "world" and "cup", then a punctuation token
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True}
]

# Fresh matcher holding only this pattern
matcher = Matcher(nlp.vocab)
matcher.add("world_cup_pattern", [pattern])
matches = matcher(doc)

# Print the text of every matched span
for match_id, start, end in matches:
    print(doc[start:end].text)

2018 FIFA World Cup:


### Matching other token attributes

In [31]:
# Two-token pattern: a verb whose lemma is "love", followed by a noun
pattern = [{"LEMMA": "love", "POS": "VERB"}, {"POS": "NOUN"}]

# Fresh matcher holding only this pattern
matcher = Matcher(nlp.vocab)
matcher.add("loving_something_pattern", [pattern])

doc = nlp("I loved dogs but now I love cats more.")
matches = matcher(doc)

# Turn the (match_id, start, end) triples into spans first,
# then print each span's text
matched_spans = [doc[start:end] for _match_id, start, end in matches]
for matched_span in matched_spans:
    print(matched_span.text)

loved dogs
love cats


### Using operators and quantifiers

- `{"OP": "!"}`: Negation: match exactly 0 times
- `{"OP": "?"}`: Optional: match 0 or 1 times
- `{"OP": "+"}`: Match 1 or more times
- `{"OP": "*"}`: Match 0 or more times

In [33]:
# Describe each pattern token separately, then assemble the pattern
buy_lemma = {"LEMMA": "buy"}
optional_determiner = {"POS": "DET", "OP": "?"}  # "?" = optional: match 0 or 1 times
noun = {"POS": "NOUN"}
pattern = [buy_lemma, optional_determiner, noun]

# Fresh matcher holding only this pattern
matcher = Matcher(nlp.vocab)
matcher.add("buying_something_possibly_with_determiner", [pattern])

doc = nlp("I bought a smartphone. Now I'm buying apps.")
matches = matcher(doc)

# Each match is a (match_id, start, end) triple; print the text of the
# span delimited by start and end
for match in matches:
    start, end = match[1], match[2]
    print(doc[start:end].text)

bought a smartphone
buying apps
