In [1]:
import spacy

# Load the small English pipeline and annotate one example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One output row per token: text, lemma, coarse POS, fine-grained tag,
# dependency label, shape, and the is_alpha / is_stop flags.
for tok in doc:
    row = (tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
           tok.shape_, tok.is_alpha, tok.is_stop)
    print(*row)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [4]:
from spacy import displacy
# Render the dependency parse of `doc` inline in the notebook.
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={"distance": 90},
)

In [5]:
# Noun chunks: for each chunk print its text, its root token, the root's
# dependency label, and the text of the root's head.
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for np_span in doc.noun_chunks:
    chunk_root = np_span.root
    print(np_span.text, chunk_root.text, chunk_root.dep_, chunk_root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


In [6]:
# Navigating the parse tree: for each token of the current `doc`, print its
# dependency label, its head (text and POS), and its direct children.
for tok in doc:
    parent = tok.head
    print(tok.text, tok.dep_, parent.text, parent.pos_, list(tok.children))

Autonomous amod cars NOUN []
cars nsubj shift VERB [Autonomous]
shift ROOT shift VERB [cars, liability]
insurance compound liability NOUN []
liability dobj shift VERB [insurance, toward]
toward prep liability NOUN [manufacturers]
manufacturers pobj toward ADP []


In [12]:
# Inspect the syntactic children to the left and right of "apples" (doc[2]).
doc = nlp("bright red apples on the tree")
apples = doc[2]
print([child.text for child in apples.lefts])  # ['bright', 'red']
print([child.text for child in apples.rights])  # ['on']
print(apples.n_lefts)  # 2
print(apples.n_rights)  # 1

['bright', 'red']
['on']
2
1


In [13]:
doc = nlp("Credit and mortgage account holders must submit their requests")

# The root is the token that is its own head; the subject is taken to be
# the root's first left child.
root = [tok for tok in doc if tok.head == tok][0]
subject = list(root.lefts)[0]

# Walk every token under the subject, checking that each one really is the
# subject itself or one of its descendants, and print its ancestor chain.
for node in subject.subtree:
    assert subject is node or subject.is_ancestor(node)
    ancestor_texts = [anc.text for anc in node.ancestors]
    print(node.text, node.dep_, node.n_lefts, node.n_rights, ancestor_texts)

Credit nmod 0 2 ['account', 'holders', 'submit']
and cc 0 0 ['Credit', 'account', 'holders', 'submit']
mortgage conj 0 0 ['Credit', 'account', 'holders', 'submit']
account compound 1 0 ['holders', 'submit']
holders nsubj 1 0 ['submit']


In [14]:
# Named entity recognition: print each entity with its character offsets
# and label.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for entity in doc.ents:
    fields = (entity.text, entity.start_char, entity.end_char, entity.label_)
    print(*fields)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [15]:
# Accessing entity annotations
doc = nlp("San Francisco considers banning sidewalk delivery robots")

# Document level: one (text, start_char, end_char, label) tuple per entity.
ents = []
for span in doc.ents:
    ents.append((span.text, span.start_char, span.end_char, span.label_))
print(ents)

# Token level: text, IOB tag and entity type of the first two tokens.
ent_san, ent_francisco = (
    [doc[idx].text, doc[idx].ent_iob_, doc[idx].ent_type_] for idx in (0, 1)
)
print(ent_san)  # ['San', 'B', 'GPE']
print(ent_francisco)  # ['Francisco', 'I', 'GPE']

[('San Francisco', 0, 13, 'GPE')]
['San', 'B', 'GPE']
['Francisco', 'I', 'GPE']


In [16]:
# Baseline: list the entities found in this sentence before any manual edits.
doc = nlp("fb is hiring a new vice president of global policy")
ents = [
    (span.text, span.start_char, span.end_char, span.label_)
    for span in doc.ents
]
print('Before', ents)

Before []


In [0]:
from spacy.tokens import Span
# Build an ORG span over the first token of `doc` and append it to the
# document's existing entities.
fb_ent = Span(doc, 0, 1, label="ORG")
doc.ents = [*doc.ents, fb_ent]

In [19]:
# Re-list the entities of `doc` after the manual addition.
ents = [
    (span.text, span.start_char, span.end_char, span.label_)
    for span in doc.ents
]
print('After', ents)

After [('fb', 0, 2, 'ORG')]


In [0]:
# Training and updating: example annotations.
# Each item is (text, [(start_char, end_char, label), ...]); the offsets are
# character positions into the text, e.g. 7-17 covers "Chaka Khan" and
# 7-13 / 18-24 cover "London" / "Berlin".
# NOTE(review): train_data is not used by any later cell in this view.
train_data = [
    ("Who is Chaka Khan?", [(7, 17, "PERSON")]),
    ("I like London and Berlin.", [(7, 13, "LOC"), (18, 24, "LOC")]),
]

In [25]:
# Highlight the named entities of a longer sentence inline in the notebook.
# The literal is split across lines for readability; the text is unchanged.
text = ("When Sebastian Thrun started working on self-driving cars at Google "
        "in 2007, few people outside of the company took him seriously.")
doc = nlp(text)
# No `options` here: 'distance' is a setting of the dependency ('dep')
# visualizer and has no effect on the entity ('ent') visualizer.
displacy.render(doc, style='ent', jupyter=True)

In [26]:
# List each entity with its label and knowledge-base ID.
doc = nlp("Ada Lovelace was born in London")
# document level
ents = [(e.text, e.label_, e.kb_id_) for e in doc.ents]
# NOTE: the recorded output below has '' for every kb_id_, not 'Q7259' / 'Q84'
# -- presumably because the loaded pipeline has no entity linker (confirm).
print(ents)  # [('Ada Lovelace', 'PERSON', ''), ('London', 'GPE', '')]

[('Ada Lovelace', 'PERSON', ''), ('London', 'GPE', '')]


In [27]:
# token level: text, entity type and knowledge-base ID of tokens 0, 1 and 5
# of the current `doc`.
# NOTE: the recorded output below has '' for every ent_kb_id_, not
# 'Q7259' / 'Q84' -- presumably because the loaded pipeline has no entity
# linker (confirm).
ent_ada_0 = [doc[0].text, doc[0].ent_type_, doc[0].ent_kb_id_]
ent_ada_1 = [doc[1].text, doc[1].ent_type_, doc[1].ent_kb_id_]
ent_london_5 = [doc[5].text, doc[5].ent_type_, doc[5].ent_kb_id_]
print(ent_ada_0)  # ['Ada', 'PERSON', '']
print(ent_ada_1)  # ['Lovelace', 'PERSON', '']
print(ent_london_5)  # ['London', 'GPE', '']

['Ada', 'PERSON', '']
['Lovelace', 'PERSON', '']
['London', 'GPE', '']


In [28]:
# Tokenization: print the text of every token, one per line.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for tok in doc:
    token_text = tok.text
    print(token_text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [29]:
# Tokenize the phrase with the tokenizer's current rules.
doc = nlp("gimme that")
token_texts = [tok.text for tok in doc]
print(token_texts)  # ['gimme', 'that']

['gimme', 'that']


In [30]:
from spacy.symbols import ORTH
# Register a tokenizer special case that splits "gimme" into "gim" + "me".
special_case = [{ORTH: piece} for piece in ("gim", "me")]
nlp.tokenizer.add_special_case("gimme", special_case)

# Tokenize the same phrase again to see the new rule applied.
retokenized = nlp("gimme that")
print([tok.text for tok in retokenized])  # ['gim', 'me', 'that']

['gim', 'me', 'that']


In [32]:
from spacy.tokens import Doc
#custom whitespace tokenizer
class WhitespaceTokenizer(object):
    """Tokenizer that splits text on single space characters only.

    Built so that the words and their trailing-space flags reproduce the
    input text exactly, including runs of spaces and leading or trailing
    spaces.
    """

    def __init__(self, vocab):
        """Keep the vocab that the produced Doc objects are built with."""
        self.vocab = vocab

    def __call__(self, text):
        """Return a Doc whose tokens are the space-separated pieces of text.

        text: the string to tokenize.
        Returns Doc(self.vocab, words=..., spaces=...).
        """
        words = text.split(' ')
        # By default every token 'owns' the space character that follows it.
        spaces = [True] * len(words)
        # split(' ') yields '' for every extra space (consecutive, leading or
        # trailing). Represent each one as a real one-space token that owns
        # no further space, so no zero-length token is produced.
        for i, word in enumerate(words):
            if word == '':
                words[i] = ' '
                spaces[i] = False
        if words[-1] == ' ':
            # The text ended with a space (or was empty): that final space is
            # already owned by the previous token, so drop the placeholder.
            words.pop()
            spaces.pop()
        else:
            # The text ended with a word: nothing follows the last token.
            spaces[-1] = False
        return Doc(self.vocab, words=words, spaces=spaces)

# Reload the pipeline and swap in the whitespace-only tokenizer, then show
# that punctuation now stays attached to the neighbouring word.
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought. It wasn't a dream.")
token_texts = [tok.text for tok in doc]
print(token_texts)

["What's", 'happened', 'to', 'me?', 'he', 'thought.', 'It', "wasn't", 'a', 'dream.']


In [33]:
from spacy.lang.en import English

# Build a Doc by hand from words plus per-word trailing-space flags, then
# show each token's text, text with trailing whitespace, and whitespace.
nlp = English()
words = ["Hello", ",", "world", "!"]
spaces = [False, True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print([(tok.text, tok.text_with_ws, tok.whitespace_) for tok in doc])

[('Hello', 'Hello', ''), (',', ', ', ' '), ('world', 'world', ''), ('!', '!', '')]


In [34]:
from spacy.gold import align

# Align two tokenizations of the same sentence that differ only in how the
# possessive is split ("'" + "s" versus "'s").
other_tokens = "i listened to obama ' s podcasts .".split()
spacy_tokens = "i listened to obama 's podcasts .".split()
cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)

# Expected values (see the recorded output): cost 2;
# a2b [0, 1, 2, 3, -1, -1, 5, 6]; b2a [0, 1, 2, 3, 5, 6, 7];
# a2b_multi {4: 4, 5: 4}; b2a_multi {}.
report = [
    ("Misaligned tokens:", cost),
    ("One-to-one mappings a -> b", a2b),
    ("One-to-one mappings b -> a", b2a),
    ("Many-to-one mappings a -> b", a2b_multi),
    ("Many-to-one mappings b-> a", b2a_multi),
]
for label, value in report:
    print(label, value)

Misaligned tokens: 2
One-to-one mappings a -> b [ 0  1  2  3 -1 -1  5  6]
One-to-one mappings b -> a [0 1 2 3 5 6 7]
Many-to-one mappings a -> b {4: 4, 5: 4}
Many-to-one mappings b-> a {}


In [35]:
doc = nlp("I live in New York")
print("Before:", [tok.text for tok in doc])

# Merge tokens 3-4 ("New", "York") into one token and set its lemma.
with doc.retokenize() as merger:
    merger.merge(doc[3:5], attrs={"LEMMA": "new york"})
print("After:", [tok.text for tok in doc])

Before: ['I', 'live', 'in', 'New', 'York']
After: ['I', 'live', 'in', 'New York']


In [37]:
from spacy.tokens import Token
# Register a custom token attribute, token._.is_musician.
# force=True makes this cell safe to re-run: without it, registering an
# extension name that already exists raises an error.
Token.set_extension("is_musician", default=False, force=True)

nlp = spacy.load("en_core_web_sm")
doc = nlp("I like David Bowie")
print("Before:", [(token.text, token._.is_musician) for token in doc])

# Merge "David" + "Bowie" into one token and set the custom attribute on the
# merged token through the "_" key of attrs.
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[2:4], attrs={"_": {"is_musician": True}})
print("After:", [(token.text, token._.is_musician) for token in doc])

Before: [('I', False), ('like', False), ('David', False), ('Bowie', False)]
After: [('I', False), ('like', False), ('David Bowie', True)]


In [38]:
# Sentence segmentation with the loaded pipeline: print one sentence per line.
doc = nlp("This is a sentence. This is another sentence.")
for sentence in doc.sents:
    sentence_text = sentence.text
    print(sentence_text)

This is a sentence.
This is another sentence.


In [39]:
# Sentence segmentation without a trained model: start from a bare English
# language object and add only the "sentencizer" component.
nlp = English()
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)

doc = nlp("This is a sentence. This is another sentence.")
for sentence in doc.sents:
    sentence_text = sentence.text
    print(sentence_text)

This is a sentence.
This is another sentence.


In [40]:
# Custom rule-based strategy: first show how the stock pipeline segments a
# text that uses "..." as a separator.
text = "this is a sentence...hello...and another sentence."

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
sentences_before = [span.text for span in doc.sents]
print("Before:", sentences_before)

def set_custom_boundaries(doc):
    """Mark the token after every "..." token as the start of a sentence.

    The last token is skipped because nothing follows it. The flags are set
    on the tokens of `doc` in place, and the same `doc` is returned.
    """
    for tok in doc[:-1]:
        if tok.text != "...":
            continue
        follower = doc[tok.i + 1]
        follower.is_sent_start = True
    return doc

# Insert the custom boundary component ahead of the parser, then re-run the
# pipeline on the same text to see the new segmentation.
nlp.add_pipe(set_custom_boundaries, before="parser")
doc = nlp(text)
sentences_after = [span.text for span in doc.sents]
print("After:", sentences_after)

Before: ['this is a sentence...', 'hello...and another sentence.']
After: ['this is a sentence...', 'hello...', 'and another sentence.']
