### The nlp object

In [2]:
# Import spaCy
import spacy

# Create a blank English nlp object
nlp = spacy.blank("en")

### The Doc object

In [3]:
# Created by processing a string of text with the nlp object
doc = nlp("Hello world!")

# Iterate over tokens in a Doc
for token in doc:
    print(token.text)

Hello
world
!


### The Token object

In [4]:
doc = nlp("Hello world!")

# Index into the Doc to get a single Token
token = doc[1]

# Get the token text via the .text attribute
print(token.text)

world


### The Span object

In [5]:
doc = nlp("Hello world!")

# A slice from the Doc is a Span object
span = doc[1:3]

# Get the span text via the .text attribute
print(span.text)

world!


### Lexical Attributes

In [6]:
doc = nlp("It costs $5.")

print ("Index: ", [token.i for token in doc])
print("Text: ", [token.text for token in doc] )

Index:  [0, 1, 2, 3, 4]
Text:  ['It', 'costs', '$', '5', '.']


In [7]:
print("is_alpha:", [token.is_alpha for token in doc])
print("is_punct", [token.is_punct for token in doc])
print("like_num: ", [token.like_num for token in doc])

is_alpha: [True, True, False, False, False]
is_punct [False, False, False, False, True]
like_num:  [False, False, False, True, False]


### Pipeline Packages

In [8]:
#!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")

### Predicting Part-of-speech Tags

In [2]:
# Load the small English pipeline
nlp = spacy.load("en_core_web_sm")

# Process a text
doc = nlp("She ate the pizza")

# Iterate over the tokens
for token in doc:
    # Print the text and the predicted part-of-speech 
    # Attributes that return strings usually end with an underscore – attributes without the underscore return an integer ID value.
    print(token.text, token.pos)
    print(token.text, token.pos_)

She 95
She PRON
ate 100
ate VERB
the 90
the DET
pizza 92
pizza NOUN


### Predicting Syntactic Dependencies

In [3]:
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


### Predicting Named Entities

In [4]:
# Process a text
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


### Tip: the spacy.explain method

In [5]:
spacy.explain("GPE")

'Countries, cities, states'

In [6]:
spacy.explain("NNP")

'noun, proper singular'

In [7]:
spacy.explain("dobj")

'direct object'

### Match patterns

In [10]:
# Lists of dictionaries, one per token
# Match exact token texts
[{"TEXT": "iPhone"}, {"TEXT": "X"}]

# Match lexical attributes
[{"LOWER": "iphone"}, {"LOWER": "x"}]

# Match any token attributes
[{"LEMMA": "buy"}, {"POS": "NOUN"}]


# Using the Matcher #

# Import the Matcher
from spacy.matcher import Matcher

# Load a pipeline and create the nlp object
nlp = spacy.load("en_core_web_sm")

# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])

# Process some text
doc = nlp("Upcoming iPhone X release date leaked")

# Call the matcher on the doc
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)


iPhone X


### Matching lexical attributes

In [15]:
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True}
]

doc = nlp("2018 FIFA World Cup: France won!")



2018 FIFA World Cup: France won!


### Matching other token attributes

In [16]:
pattern = [
    {"LEMMA": "love", "POS": "VERB"},
    {"POS": "NOUN"}
]

doc = nlp("I loved dogs but now I love cats more.")

### Using operators and quantifiers

In [17]:
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},  # optional: match 0 or 1 times
    {"POS": "NOUN"}
]

doc = nlp("I bought a smartphone. Now I'm buying apps.")
