In [0]:
# !python -m spacy download en_core_web_lg

In [0]:
import spacy

# nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_lg")

**TOKEN**

In [3]:
doc = nlp("This is a text")
[token.text for token in doc]

['This', 'is', 'a', 'text']

**SPAN**

In [4]:
doc = nlp("This is a text")
span = doc[2:5]
span.text

'a text'

In [5]:
from spacy.tokens import Span
doc = nlp("I live in New York")
span = Span(doc, 3, 5, label="GPE")
span.text

'New York'

**STOP WORDS**

In [6]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
print('Number of stop words: %d' % len(spacy_stopwords))
print('First ten stop words: %s' % list(spacy_stopwords)[:10])

Number of stop words: 326
First ten stop words: ['thereafter', 'therein', 'else', 'had', 'indeed', 'is', 'above', 'put', 'than', 'cannot']


In [7]:
doc = nlp("I live in New York City, the capital of the New York State")
tokens = [token.text for token in doc if not token.is_stop]
print('Original Article: %s' % (doc))
print()
print(tokens)

Original Article: I live in New York City, the capital of the New York State

['live', 'New', 'York', 'City', ',', 'capital', 'New', 'York', 'State']


**LEMMATIZATION**

In [8]:
doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
print([token.text for token in doc])
print([token.lemma_ for token in doc])

['Apples', 'and', 'oranges', 'are', 'similar', '.', 'Boots', 'and', 'hippos', 'are', "n't", '.']
['apple', 'and', 'orange', 'be', 'similar', '.', 'boot', 'and', 'hippo', 'be', 'not', '.']


**PART-OF-SPEECH and Syntactic dependencies**

In [9]:
doc = nlp("This is a text.")
print([token.text for token in doc])
print([token.pos_ for token in doc])
print([token.tag_ for token in doc])
print([token.dep_ for token in doc])
print([token.head.text for token in doc])
print(spacy.explain("DET"))
print(spacy.explain("DT"))
print(spacy.explain("nsubj"))

['This', 'is', 'a', 'text', '.']
['DET', 'VERB', 'DET', 'NOUN', 'PUNCT']
['DT', 'VBZ', 'DT', 'NN', '.']
['nsubj', 'ROOT', 'det', 'attr', 'punct']
['is', 'is', 'text', 'is', 'is']
determiner
determiner
nominal subject


**NAMED ENTITIES**

In [10]:
doc = nlp("Larry Page founded Google")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Larry Page', 'PERSON'), ('Google', 'ORG')]


**SENTENCES**

In [11]:
doc = nlp("This a sentence. This is another one.")
print([sent.text for sent in doc.sents])

['This a sentence.', 'This is another one.']


**CHUNK and Base noun phrases**

In [12]:
doc = nlp("I have a red car")
print([chunk.text for chunk in doc.noun_chunks])

['I', 'a red car']


**WORD VECTORS and SIMILARITY**

In [13]:
doc1 = nlp("I like cats")
doc2 = nlp("I like dogs")
print(doc1.similarity(doc2))
print(doc1[2], "-", doc2[2], doc1[2].similarity(doc2[2]))
print(doc1[0], "-", doc2[1:3], doc1[0].similarity(doc2[1:3]))

0.957709143352323
cats - dogs 0.83117634
I - like dogs 0.46475166


In [14]:
doc = nlp("I like cats")
print(doc[2])
print(doc[2].vector)
print(doc[2].vector_norm)

cats
[-0.26763    0.029846  -0.3437    -0.54409   -0.49919    0.15928
 -0.35278   -0.2036     0.23482    1.5671    -0.36458   -0.028713
 -0.27053    0.2504    -0.18126    0.13453    0.25795    0.93213
 -0.12841   -0.18505   -0.57597    0.18538   -0.19147   -0.38465
  0.21656   -0.4387    -0.27846   -0.41339    0.37859   -0.2199
 -0.25907   -0.019796  -0.31885    0.12921    0.22168    0.32671
  0.46943   -0.81922   -0.20031    0.013561  -0.14663    0.14438
  0.0098044 -0.15439    0.21146   -0.28409   -0.4036     0.45355
  0.12173   -0.11516   -0.12235   -0.096467  -0.26991    0.028776
 -0.11307    0.37219   -0.054718  -0.20297   -0.23974    0.86271
  0.25602   -0.3064     0.014714  -0.086497  -0.079054  -0.33109
  0.54892    0.20076    0.28064    0.037788   0.0076729 -0.0050123
 -0.11619   -0.23804    0.33027    0.26034   -0.20615   -0.35744
  0.54125   -0.3239     0.093441   0.17113   -0.41533    0.13702
 -0.21765   -0.65442    0.75733    0.359      0.62492    0.019685
  0.21156    0.2

**PIPELINE**

In [0]:
def custom_component(doc):
 print("Do something to the doc here!")
 return doc

nlp.add_pipe(custom_component, first=True)

In [16]:
print(nlp.pipe_names)
print(nlp.pipeline)

['custom_component', 'tagger', 'parser', 'ner']
[('custom_component', <function custom_component at 0x7fd21c73e0d0>), ('tagger', <spacy.pipeline.pipes.Tagger object at 0x7fd23229b550>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7fd22fe61228>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7fd22fe61288>)]


**VISUALIZATION**

In [17]:
from spacy import displacy
doc = nlp("This is a sentence")
displacy.render(doc, style="dep")

Do something to the doc here!


'<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="en" id="0c841441a670430fac7ce5e1ed9a889b-0" class="displacy" width="750" height="312.0" direction="ltr" style="max-width: none; height: 312.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr">\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="222.0">\n    <tspan class="displacy-word" fill="currentColor" x="50">This</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">DET</tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="222.0">\n    <tspan class="displacy-word" fill="currentColor" x="225">is</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="225">VERB</tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="222.0">\n    <tspan class="displacy-word" fill="currentColor" x="400">a</tspan>\n    <tspan class="displacy-tag" dy=

In [18]:
doc = nlp("Larry Page founded Google")
displacy.render(doc, style="ent")

Do something to the doc here!


'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone">\n    Larry Page\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">PERSON</span>\n</mark>\n founded \n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone">\n    Google\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n</div>'