In [2]:
from spacy.lang.en import English

In [3]:
# Create an English pipeline object; the cells below call it on raw text to
# obtain a tokenized document.
nlp = English()

In [4]:
# Tokenize a short greeting and show every token's text on its own line.
doc = nlp("Hello World!")
print(*(tok.text for tok in doc), sep="\n")

Hello
World
!


In [5]:
# Indexing the document with an integer yields a single token; position 1 is
# the second token of "Hello World!".
token = doc[1]
print(token.text)

World


In [6]:
# Slicing the document yields a span of consecutive tokens. The document has
# exactly three tokens ("Hello", "World", "!"), so the exclusive end index is
# 3; the previous end index of 4 pointed past the end of the document and only
# produced the same result because the slice was clamped.
span = doc[1:3]
print(span.text)

World!


In [7]:
# Tokenize a sentence containing a currency symbol and a number, then print one
# row per token attribute (index, text, and three boolean lexical flags).
doc = nlp("It Cost $5.")

# (row label, token attribute name) pairs, in display order.
attribute_rows = [
    ('Index: ', 'i'),
    ('Text: ', 'text'),
    ('is_alpha:', 'is_alpha'),
    ('is_punct:', 'is_punct'),
    ('like.num:', 'like_num'),
]
for row_label, attr_name in attribute_rows:
    print(row_label, [getattr(tok, attr_name) for tok in doc])

Index:  [0, 1, 2, 3, 4]
Text:  ['It', 'Cost', '$', '5', '.']
is_alpha: [True, True, False, False, False]
is_punct: [False, False, False, False, True]
like.num: [False, False, False, True, False]


### Importing the German Language Class

In [8]:
from spacy.lang.de import German

In [9]:
# Rebind nlp to a German pipeline object and run it over a short greeting.
nlp = German()
german_greeting = "Liebe Grüße"  # "Kind Regards"
doc = nlp(german_greeting)
print(doc.text)

Liebe Grüße


### Slicing and Tokenization

In [13]:
from spacy.lang.en import English

# Switch back to an English pipeline (nlp was last bound to the German one).
nlp = English()
sentence = "I like tree kangaroos and narwhals."
doc = nlp(sentence)

# Index 0 selects the first token of the document.
first_token = doc[0]
print(first_token.text)

I


In [16]:
# Slice out the tokens at positions 1 and 2 (the end index 3 is exclusive).
# NOTE(review): despite its name, first_token now holds a two-token slice
# ("like tree"), not a single token.
first_token = doc[1:3]
# A bare last expression is displayed as the cell's output.
first_token

like tree

In [19]:
# Tokens 2 and 3 of the sentence are "tree" and "kangaroos" (end index 4 is
# exclusive).
tree_kangaroos = doc[2:4]
print(tree_kangaroos.text)

tree kangaroos


In [20]:
# Tokens 2 through 5 cover "tree kangaroos and narwhals"; the end index 6
# stops before the final "." token.
tree_kangaroos_and_narwhals = doc[2:6]
print(tree_kangaroos_and_narwhals.text)

tree kangaroos and narwhals


In [26]:
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Report every number-like token that is immediately followed by a "%" token.
for token in doc:
    # A number-like token in the last position has no successor; indexing one
    # past the end of the document would raise IndexError, so it is skipped.
    if token.like_num and token.i + 1 < len(doc):
        next_token = doc[token.i + 1]
        if next_token.text == "%":
            print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


### Statistical Models (Separately Installed Package)

In [31]:
pip install en_core_web_sm

Note: you may need to restart the kernel to use updated packages.


In [32]:
import spacy as sp

# Load the installed 'en_core_web_sm' package; the cells below read
# part-of-speech tags, dependency labels and named entities from its output.
nlp = sp.load('en_core_web_sm')

In [41]:
# Run the loaded pipeline and list each token next to its part-of-speech tag.
doc = nlp("She ate the pizza")
print("\n".join(f"{tok.text} {tok.pos_}" for tok in doc))

She PRON
ate VERB
the DET
pizza NOUN


In [42]:
# For each token show: text, part-of-speech tag, dependency label, and the
# text of its syntactic head.
for token in doc:
    fields = (token.text, token.pos_, token.dep_, token.head.text)
    print(*fields)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [56]:
# Run the pipeline over a sentence with named entities and list each entity's
# text together with its label.
doc = nlp("Larry Page is an owner and co-founder of Google")
for ent in doc.ents:
    entity_text, entity_label = ent.text, ent.label_
    print(entity_text, entity_label)

Larry Page PERSON
Google ORG


In [57]:
# Look up the human-readable description of the "det" dependency label.
sp.explain("det")

'determiner'

In [58]:
# Look up the human-readable description of the 'dobj' dependency label.
sp.explain('dobj')

'direct object'

In [59]:
# Print text, part-of-speech tag and dependency label as left-aligned,
# fixed-width columns (12, 10 and 10 characters wide).
row_template = "{:<12}{:<10}{:<10}"
for token in doc:
    token_text, token_pos, token_dep = token.text, token.pos_, token.dep_
    print(row_template.format(token_text, token_pos, token_dep))

Larry       PROPN     compound  
Page        PROPN     nsubj     
is          AUX       ROOT      
an          DET       det       
owner       NOUN      attr      
and         CCONJ     cc        
co          NOUN      dep       
-           NOUN      dep       
founder     NOUN      conj      
of          ADP       prep      
Google      PROPN     pobj      


In [60]:
# Look up the human-readable description of the "aux" label.
sp.explain("aux")

'auxiliary'

In [61]:
# Look up the human-readable description of the "relcl" label.
sp.explain("relcl")

'relative clause modifier'

In [63]:
from spacy.matcher import Matcher

In [64]:
# Load the pipeline through the `sp` alias: the package is imported as
# `import spacy as sp` in the visible cells, so the bare name `spacy` is not
# bound and `spacy.load(...)` would raise NameError on a fresh kernel.
nlp = sp.load('en_core_web_sm')

# The matcher is built on the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# Two-token pattern: the exact text "iPhone" immediately followed by "X".
pattern = [{'TEXT': 'iPhone'}, {'TEXT': 'X'}]
# NOTE(review): this is the spaCy v2 call shape (key, on_match callback,
# pattern). spaCy v3 expects matcher.add('IPHONE_PATTERN', [pattern]) —
# confirm against the installed spaCy version.
matcher.add('IPHONE_PATTERN', None, pattern)

doc = nlp("New iPhone X release date leaked")
# Run the matcher over the document and keep its result.
matches = matcher(doc)