In [1]:
# import others
import pandas as pd
# from tabulate import tabulate

# import spacy
import spacy
from spacy import displacy
# load english default model, model download is required
nlp = spacy.load('en')


# spacy model architecture
1. spacy doc object (container) is the entry point of the spacy API. It is constructed after passing raw text into a spacy model. <br />
NOTE: a spacy model is a pipeline of functions (tokenizer --> tagger --> parser --> NER --> ...), whose output is a doc object. A pipeline is made of components, different components is responsible to add different object attributes (linguistic features) to the doc container 
2. there are two big category of classes in spacy: Container Objects and Processing Pipelines
3. most linguistic features can be accessed via container objects, there are four container objects <br />
    i. doc: sequence of tokens <br />
    ii. token: individual tokens, like a work, punctuation symbol, space <br />
    iii. span: a slice of doc, like a sentence, a noun chunk <br />
    iv. lexeme: an entry in the vocabulary <br />
4. Like many NLP libraries, spaCy encodes all strings to hash values to reduce memory usage and improve efficiency,
    to see string representation, use method call with suffix '\_', like, 'pos\_' instead of 'pos'
5. API details: https://spacy.io/api/doc

# basic navigation of spacy API

In [2]:
# create spacy entry point - doc object
raw_text = u"Autonomous cars shift insurance liability toward manufacturers"
doc = nlp(raw_text)

# 1. doc object
print(doc.__class__)
# 2. token object -- individual element of a doc
print(doc[0].__class__)
# 3. span object -- a slice of a doc
print(list(doc.noun_chunks)[0].__class__)  # a noun chunk
print(list(doc.sents)[0].__class__)  # a sentence
# 4. lexeme
print('first lexeme repr in spacy vocab is: {}'.format(list(doc.vocab)[0].text))
print(list(doc.vocab)[0].__class__)

<class 'spacy.tokens.doc.Doc'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.span.Span'>
<class 'spacy.tokens.span.Span'>
first lexeme repr in spacy vocab is: convincing
<class 'spacy.lexeme.Lexeme'>


# linguistic Features

### 1. POS tagging

In [3]:
# get token's linguistic features (pos / dep / lemma / ... at token's level)
data = {}
for token in doc:
    data.setdefault('text', []).append(token.text)
    data.setdefault('lemma', []).append(token.lemma_)
    data.setdefault('pos', []).append(token.pos_)
    data.setdefault('tag', []).append(token.tag_)
    data.setdefault('dep', []).append(token.dep_)
    data.setdefault('shape', []).append(token.shape_)
    data.setdefault('is_alpha', []).append(token.is_alpha)
    data.setdefault('is_stop', []).append(token.is_stop)

pd.DataFrame(data)

Unnamed: 0,text,lemma,pos,tag,dep,shape,is_alpha,is_stop
0,Autonomous,autonomous,ADJ,JJ,amod,Xxxxx,True,False
1,cars,car,NOUN,NNS,nsubj,xxxx,True,False
2,shift,shift,VERB,VBP,ROOT,xxxx,True,False
3,insurance,insurance,NOUN,NN,compound,xxxx,True,False
4,liability,liability,NOUN,NN,dobj,xxxx,True,False
5,toward,toward,ADP,IN,prep,xxxx,True,True
6,manufacturers,manufacturer,NOUN,NNS,pobj,xxxx,True,False


#### linguistic features

Text: The original word text. <br />
Lemma: The base form of the word. <br />
POS: The simple part-of-speech tag. <br />
Tag: The detailed part-of-speech tag. <br />
Dep: Syntactic dependency, i.e. the relation between tokens. <br />
Shape: The word shape – capitalisation, punctuation, digits. <br />
is alpha: Is the token an alpha character? <br />
is stop: Is the token part of a stop list, i.e. the most common words of the language? <br />

In [4]:
# don't understand what does 'amod' mean?
spacy.explain('amod')

'adjectival modifier'

### 2. dependency parsing

In [5]:
# 1. Noun Chunks
data = {}
for chunk in doc.noun_chunks:
    data.setdefault('text', []).append(chunk.text)
    data.setdefault('root text', []).append(chunk.root.text)
    data.setdefault('root dep', []).append(chunk.root.dep_)
    data.setdefault('explain', []).append(spacy.explain(chunk.root.dep_))
    data.setdefault('root head text', []).append(chunk.root.head.text)

pd.DataFrame(data)

Unnamed: 0,text,root text,root dep,explain,root head text
0,Autonomous cars,cars,nsubj,nominal subject,shift
1,insurance liability,liability,dobj,direct object,shift
2,manufacturers,manufacturers,pobj,object of preposition,toward


#### Noun Chunks

Text: The original noun chunk text. <br />
Root text: The original text of the word connecting the noun chunk to the rest of the parse. <br />
Root dep: Dependency relation connecting the root to its head. <br />
Root head text: The text of the root token's head. <br />

In [6]:
# 2. Navigating the parse tree
data = {}
for token in doc:
    data.setdefault('text', []).append(token.text)
    data.setdefault('dep', []).append(token.dep_)
    data.setdefault('explain', []).append(spacy.explain(token.dep_))
    data.setdefault('head text', []).append(token.head.text)
    data.setdefault('head pos', []).append(token.head.pos_)
    data.setdefault('children', []).append([child for child in token.children])

pd.DataFrame(data)

Unnamed: 0,text,dep,explain,head text,head pos,children
0,Autonomous,amod,adjectival modifier,cars,NOUN,[]
1,cars,nsubj,nominal subject,shift,VERB,[Autonomous]
2,shift,ROOT,,shift,VERB,"[cars, liability, toward]"
3,insurance,compound,,liability,NOUN,[]
4,liability,dobj,direct object,shift,VERB,[insurance]
5,toward,prep,prepositional modifier,shift,VERB,[manufacturers]
6,manufacturers,pobj,object of preposition,toward,ADP,[]


#### parse tree

Text: The original token text. <br />
Dep: The syntactic relation connecting child to head. <br />
Head text: The original text of the token head. <br />
Head POS: The part-of-speech tag of the token head. <br />
Children: The immediate syntactic dependents of the token. <br />

In [7]:
# 3. Iterate local sub tree
# interest: find verbs that has a subject
from spacy.symbols import nsubj, VERB
verbs = set()
for possible_subject in doc:
    # make sure subject is the nominal subject and its head is a Verb
    if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
        # add its head, which is a Verb, to verbs
        verbs.add(possible_subject.head)
print('out interest: {}'.format(verbs))

# iterate our interest's subtree
interest = verbs.pop()
print(interest.__class__)
data = {}
for descendant in interest.subtree:
    data.setdefault('text', []).append(descendant.text)
    data.setdefault('dep', []).append(descendant.dep_)
    data.setdefault('explain', []).append(spacy.explain(descendant.dep_))
    data.setdefault('n_lefts', []).append(descendant.n_lefts)
    data.setdefault('lefts', []).append(list(descendant.lefts))
    data.setdefault('n_rights', []).append(descendant.n_rights)
    data.setdefault('rights', []).append(list(descendant.rights))
    data.setdefault('ancestor', []).append([ancestor.text for ancestor in descendant.ancestors])

pd.DataFrame(data)

out interest: {shift}
<class 'spacy.tokens.token.Token'>


Unnamed: 0,text,dep,explain,n_lefts,lefts,n_rights,rights,ancestor
0,Autonomous,amod,adjectival modifier,0,[],0,[],"[cars, shift]"
1,cars,nsubj,nominal subject,1,[Autonomous],0,[],[shift]
2,shift,ROOT,,1,[cars],2,"[liability, toward]",[]
3,insurance,compound,,0,[],0,[],"[liability, shift]"
4,liability,dobj,direct object,1,[insurance],0,[],[shift]
5,toward,prep,prepositional modifier,0,[],1,[manufacturers],[shift]
6,manufacturers,pobj,object of preposition,0,[],0,[],"[toward, shift]"


In [8]:
# 4. parse tree visualization
displacy.render(doc, style='dep', jupyter = True, options = {'distance': 120})

### 3. Named Entity Recognition (NER)

In [9]:
# 1. access Entity via "doc.ents" method at document level
raw_text = u"Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(raw_text)

data = {}
for ent in doc.ents:
    data.setdefault('text', []).append(ent.text)
    data.setdefault('start_char', []).append(ent.start_char)
    data.setdefault('end_char', []).append(ent.end_char)
    data.setdefault('label', []).append(ent.label_)
    data.setdefault('explain', []).append(spacy.explain(ent.label_))

pd.DataFrame(data)

Unnamed: 0,text,start_char,end_char,label,explain
0,Apple,0,5,ORG,"Companies, agencies, institutions, etc."
1,U.K.,27,31,GPE,"Countries, cities, states"
2,$1 billion,44,54,MONEY,"Monetary values, including unit"


#### NER attr

Text: The original entity text. <br />
Start: Index of start of entity in the Doc. <br />
End: Index of end of entity in the Doc. <br />
Label: Entity label, i.e. type. <br />

In [10]:
# 2. access Entity at token level
data = {}
for token in doc:
    data.setdefault('text', []).append(token.text)
    data.setdefault('ENT_IOB (hash)', []).append(token.ent_iob)    
    data.setdefault('ENT_IOB_', []).append(token.ent_iob_)
    data.setdefault('ENT_TYPE_', []).append(token.ent_type_)
    data.setdefault('explain', []).append(spacy.explain(token.ent_type_))

pd.DataFrame(data)

Unnamed: 0,text,ENT_IOB (hash),ENT_IOB_,ENT_TYPE_,explain
0,Apple,3,B,ORG,"Companies, agencies, institutions, etc."
1,is,2,O,,
2,looking,2,O,,
3,at,2,O,,
4,buying,2,O,,
5,U.K.,3,B,GPE,"Countries, cities, states"
6,startup,2,O,,
7,for,2,O,,
8,$,3,B,MONEY,"Monetary values, including unit"
9,1,1,I,MONEY,"Monetary values, including unit"


#### IOB SCHEME

I – Token is inside an entity. <br />
O – Token is outside an entity. <br />
B – Token is the beginning of an entity. <br />

In [13]:
# 3. over write or re-edit NER at document level
from spacy.tokens import Span

doc = nlp(u"FB is hiring a new Vice President of global policy")
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print('Before', ents)
# the model didn't recognise "FB" as an entity :(

ORG = doc.vocab.strings[u'ORG']  # get hash value of entity 'ORG' label
fb_ent = Span(doc, 0, 1, label=ORG) # create a Span, which start from idx 0 to 1, for the new entity
doc.ents = list(doc.ents) + [fb_ent]

ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print('After', ents)

Before []
After [('FB', 0, 2, 'ORG')]


In [14]:
# 4. Entity visualization
displacy.render(doc, style='ent', jupyter=True)

### 4. Tokenization

In [15]:
# 1. add special case tokenization rule
from spacy.symbols import ORTH, LEMMA, POS, TAG   # those are hash values
doc = nlp(u'gimme that')  # phrase to tokenize
print([w.text for w in doc])  # ['gimme', 'that']

# add special case rule
special_case = [{ORTH: u'gim', LEMMA: u'give', POS: u'VERB'}, {ORTH: u'me'}]
nlp.tokenizer.add_special_case(u'gimme', special_case)

# check new tokenization
print([w.text for w in nlp(u'gimme that')])  # ['gim', 'me', 'that']

# Pronoun lemma is returned as -PRON-!
print([w.lemma_ for w in nlp(u'gimme that')])  # ['give', '-PRON-', 'that']

['gimme', 'that']
['gim', 'me', 'that']
['give', '-PRON-', 'that']


#### spacy tokenization algo
1. split by space
2. handle special case or special rules
3. consume prefix
4. consume suffix
5. consume infix
6. can't consume any more, handle as single token

In [16]:
# 2. Customizing spaCy's Tokenizer class
import re
from spacy.tokenizer import Tokenizer

prefix_re = re.compile(r'''^[\[\("']''')
suffix_re = re.compile(r'''[\]\)"']$''')
infix_re = re.compile(r'''[-~]''')
simple_url_re = re.compile(r'''^https?://''')

def custom_tokenizer(nlp):
    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                                suffix_search=suffix_re.search,
                                infix_finditer=infix_re.finditer,
                                token_match=simple_url_re.match)    
# the last one is optional: token_match matching strings 
# that should never be split, overriding the previous rules. 
# Useful for things like URLs or numbers.


nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = custom_tokenizer(nlp)
doc = nlp(u"hello-world.")
print([t.text for t in doc])

['hello', '-', 'world.']


In [17]:
# 3. Hooking an arbitrary tokenizer into the pipeline
# NOTE: tokenizer is the first component in spacy model process pipeline,
# NOTE: unlike other component, its input is raw text, output is doc object

from spacy.tokens import Doc


class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

# load model to get vocab
nlp = spacy.load('en_core_web_sm')
# need vocal to construct tokenizer object
# assign tokenizer to model.tokenizer
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
# construct doc
doc = nlp(u"What's happened to me? he thought. It wasn't a dream.")

print([t.text for t in doc])

["What's", 'happened', 'to', 'me?', 'he', 'thought.', 'It', "wasn't", 'a', 'dream.']


### 5. Sentence Segmentation

In [18]:
# 1. access sentences via doc.sents method -- returns a generator
# spaCy uses the dependency parse to determine sentence boundaries

doc = nlp(u"This is a sentence. This is another sentence.")
for sent in doc.sents:
    print(sent.text)

This is a sentence.
This is another sentence.


In [20]:
# 2. setting boundaries manually -- there are three ways to do it

In [19]:
# i. add custom pipeline component before dependency parser

text = u"this is a sentence...hello...and another sentence."

doc = nlp(text)
print('Before:', [sent.text for sent in doc.sents])

def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == '...':
            doc[token.i+1].is_sent_start = True
    return doc

nlp.add_pipe(set_custom_boundaries, before='parser')
doc = nlp(text)
print('After:', [sent.text for sent in doc.sents])

Before: ['this is a sentence...hello...and another sentence.']
After: ['this is a sentence...hello...and another sentence.']


In [21]:
# ii. add Rule-based pipeline component -- this will remove dependency parser
from spacy.lang.en import English

nlp = English()  # just the language with no model
sbd = nlp.create_pipe('sentencizer')   # The sentencizer component splits sentences on punctuation like ., ! or ?
nlp.add_pipe(sbd)
doc = nlp(u"This is a sentence. This is another sentence.")

print(doc.is_parsed)

for sent in doc.sents:
    print(sent.text)

False
This is a sentence.
This is another sentence.


In [22]:
# iii. add Custom rule-based strategy -- only modify strategy within a pipeline component, it also remove dependency parser

from spacy.lang.en import English
from spacy.pipeline import SentenceSegmenter


def split_on_newlines(doc):
    """
    This is the strategy function.
    The strategy should be a function that takes a Doc object and yields a Span for each sentence
    """
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline and not word.is_space:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text == '\n':
            seen_newline = True
    if start < len(doc):
        yield doc[start:len(doc)]

nlp = English()  # just the language with no model
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
nlp.add_pipe(sbd)
doc = nlp(u"This is a sentence\n\nThis is another sentence\nAnd more")

print(doc.is_parsed)

for sent in doc.sents:
    print(sent.text)

False
This is a sentence

This is another sentence

And more


### 6. Rule-based matching