## Install

In [None]:
!pip install -U spaCy
!python -m spacy download en

## Simple Tokenization

In [5]:
import spacy

# Load the English pipeline and tokenize a string containing a run of spaces.
nlp = spacy.load('en')
doc = nlp('Hello     World!')

# Print every token next to its position in the Doc.
for position in range(len(doc)):
    print(position, doc[position].text)

0 Hello
1     
2 World
3 !


Notice the index-preserving tokenization in action. Rather than keeping only the words, spaCy keeps the whitespace too. This is helpful when you need to replace words in the original text or add annotations to it. With NLTK's default tokenization, there’s no direct way to know exactly where a tokenized word sits in the original raw text. spaCy preserves this “link” between the word and its place in the raw text. Here’s how to get the exact index of a word:

In [25]:
import pandas as pd
import spacy

nlp = spacy.load('en')
doc = nlp('Hello     World!')

# One row per token: its text and token.idx, the token's index in the raw string.
df = pd.DataFrame([(token.text, token.idx) for token in doc], columns=['text','idx'])
df

Unnamed: 0,text,idx
0,Hello,0
1,,6
2,World,10
3,!,15


In [24]:
# A sentence with a contraction and a run of spaces, to show how both are tokenized.
doc = nlp("Next week I'll   be in Madrid.")

columns = ['text','idx','lemma','is_punct','is_space','shape','pos','tag']

# Collect one tuple of attributes per token, in the same order as `columns`.
rows = []
for token in doc:
    rows.append((
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_,
    ))

df = pd.DataFrame(rows, columns=columns)

df

Unnamed: 0,text,idx,lemma,is_punct,is_space,shape,pos,tag
0,Next,0,next,False,False,Xxxx,ADJ,JJ
1,week,5,week,False,False,xxxx,NOUN,NN
2,I,10,-PRON-,False,False,X,PRON,PRP
3,'ll,11,will,False,False,'xx,VERB,MD
4,,15,,False,True,,SPACE,_SP
5,be,17,be,False,False,xx,VERB,VB
6,in,20,in,False,False,xx,ADP,IN
7,Madrid,23,Madrid,False,False,Xxxxx,PROPN,NNP
8,.,29,.,True,False,.,PUNCT,.


## Bag of Words

In [88]:
import spacy
from collections import Counter

nlp = spacy.load("en")
text = """Most of the outlay will be at home. No surprise there, either. While Samsung has expanded overseas, South Korea is still host to most of its factories and research engineers. Samsung is expected to keep this expansion in the near future."""
doc = nlp(text)

# Keep only the tokens that are neither stop words nor punctuation.
words = [token.text for token in doc if not token.is_stop and not token.is_punct]

# Count the remaining words and report the five most frequent ones.
word_freq = Counter(words)
common_words = word_freq.most_common(5)

print(common_words)

[('Samsung', 2), ('outlay', 1), ('home', 1), ('surprise', 1), ('expanded', 1)]


## Sentence detection

In [26]:
doc = nlp("These are apples. These are oranges.")

# Iterate over the sentences spaCy detected and print each on its own line.
for sentence in doc.sents:
    print(sentence)

These are apples.
These are oranges.


In [29]:
# Abbreviations such as "Dr." and "Mr." do not end a sentence here;
# the author notes this is handled far better than in NLTK.
doc = nlp("I met Dr. Brown yesterday and he told me he's see me after Mr. Smith. I accepted his suggestion.")

for sentence in doc.sents:
    print(sentence)

I met Dr. Brown yesterday and he told me he's see me after Mr. Smith.
I accepted his suggestion.


## Part of Speech Tagging

In [37]:
doc = nlp("Next week I'll be in Madrid.")

# One row per token: its text and its part-of-speech tag.
tagged_tokens = [(token.text, token.tag_) for token in doc]
df = pd.DataFrame(tagged_tokens, columns=['text','tag'])

# Add a human-readable description of every tag via spacy.explain.
df['explanation'] = [spacy.explain(tag) for tag in df['tag']]
df

Unnamed: 0,text,tag,explanation
0,Next,JJ,adjective
1,week,NN,"noun, singular or mass"
2,I,PRP,"pronoun, personal"
3,'ll,MD,"verb, modal auxiliary"
4,be,VB,"verb, base form"
5,in,IN,"conjunction, subordinating or preposition"
6,Madrid,NNP,"noun, proper singular"
7,.,.,"punctuation mark, sentence closer"


## Named Entity Recognition

In [38]:
doc = nlp("Next week I'll be in Madrid.")

# Tabulate each entity found in the text together with its label.
entity_rows = [(ent.text, ent.label_) for ent in doc.ents]
pd.DataFrame(entity_rows, columns=['text','label'])

Unnamed: 0,text,label
0,Next week,DATE
1,Madrid,GPE


In [39]:
doc = nlp("I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ")

# Same tabulation as before, on a sentence full of numbers, times and an acronym.
entity_rows = [(ent.text, ent.label_) for ent in doc.ents]
pd.DataFrame(entity_rows, columns=['text','label'])

Unnamed: 0,text,label
0,2,CARDINAL
1,9 a.m.,TIME
2,30%,PERCENT
3,just 2 days,DATE
4,WSJ,ORG


In [40]:
from spacy import displacy

# Parse the sentence, then draw its entity annotations inline in the notebook.
sentence = 'I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ'
doc = nlp(sentence)
displacy.render(doc, style='ent', jupyter=True)

## Chunking

In [43]:
doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")

# For every noun chunk: its text, its label, and the text of its root token.
chunk_rows = [
    (chunk.text, chunk.label_, chunk.root.text)
    for chunk in doc.noun_chunks
]
pd.DataFrame(chunk_rows, columns=['text','label','root_text'])

Unnamed: 0,text,label,root_text
0,Wall Street Journal,NP,Journal
1,an interesting piece,NP,piece
2,crypto currencies,NP,currencies


## Dependency Parsing

In [49]:
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')

# One row per token: the token itself, its dependency label, and its head token.
rows = []
for token in doc:
    head = token.head
    rows.append((token.text, token.tag_, token.dep_, head.text, head.tag_))

df = pd.DataFrame(rows, columns=['text','tag','dep','head_text','head_tag'])

# Describe each token's tag in plain words.
df['explanation'] = [spacy.explain(tag) for tag in df['tag']]
df

Unnamed: 0,text,tag,dep,head_text,head_tag,explanation
0,Wall,NNP,compound,Street,NNP,"noun, proper singular"
1,Street,NNP,compound,Journal,NNP,"noun, proper singular"
2,Journal,NNP,nsubj,published,VBD,"noun, proper singular"
3,just,RB,advmod,published,VBD,adverb
4,published,VBD,ROOT,published,VBD,"verb, past tense"
5,an,DT,det,piece,NN,determiner
6,interesting,JJ,amod,piece,NN,adjective
7,piece,NN,dobj,published,VBD,"noun, singular or mass"
8,on,IN,prep,piece,NN,"conjunction, subordinating or preposition"
9,crypto,JJ,amod,currencies,NNS,adjective


In [50]:
from spacy import displacy

# Parse the sentence, then draw its dependency tree inline in the notebook.
sentence = 'Wall Street Journal just published an interesting piece on crypto currencies'
doc = nlp(sentence)
render_options = {'distance': 90}
displacy.render(doc, style='dep', jupyter=True, options=render_options)

## Word Vectors

spaCy comes shipped with a Word Vector model as well. We’ll need to download a larger model for that:

In [51]:
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz#egg=en_core_web_lg==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz (826.9MB)
    100% |████████████████████████████████| 826.9MB 10.6MB/s
Installing collected packages: en-core-web-lg
  Found existing installation: en-core-web-lg 2.0.0
    Uninstalling en-core-web-lg-2.0.0:
      Successfully uninstalled en-core-web-lg-2.0.0
  Running setup.py install for en-core-web-lg ... done
Successfully installed en-core

- If two words are similar, they appear in similar contexts.
- Word vectors are computed taking into account the context (surrounding words).
- Given the two previous observations, similar words should have similar word vectors.
- Using vectors, we can derive relationships between words.

In [53]:
# Switch to the large model and show the vector stored for the word "banana".
nlp = spacy.load('en_core_web_lg')
banana_vector = nlp.vocab['banana'].vector
print(banana_vector)

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

There’s a really famous example of word embedding math: "man" - "woman" + "queen" = "king". It sounds too crazy to be true, so let’s test it out:

In [54]:
%%time
from scipy import spatial
 
cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)
 
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
queen = nlp.vocab['queen'].vector
king = nlp.vocab['king'].vector
 
# We now need to find the closest vector in the vocabulary to the result of "man" - "woman" + "queen"
maybe_king = man - woman + queen
computed_similarities = []
 
for word in nlp.vocab:
    # Ignore words without vectors
    if not word.has_vector:
        continue
 
    similarity = cosine_similarity(maybe_king, word.vector)
    computed_similarities.append((word, similarity))
 
computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])
print([w[0].text for w in computed_similarities[:10]])

['Queen', 'QUEEN', 'queen', 'King', 'KING', 'king', 'KIng', 'Kings', 'KINGS', 'kings']


## Computing Similarity

In [64]:
# Look up the four vocabulary entries to compare.
banana, dog, fruit, animal = (
    nlp.vocab[word] for word in ('banana', 'dog', 'fruit', 'animal')
)

# Each word is compared against both categories.
print(f"Dog Similarity\nanimal: {dog.similarity(animal)}\nfruit: {dog.similarity(fruit)}")
print()
print(f"Banana Similarity\nanimal: {banana.similarity(animal)}\nfruit: {banana.similarity(fruit)}")

Dog Similarity
animal: 0.6618534326553345
fruit: 0.23552849888801575

Banana Similarity
animal: 0.24272850155830383
fruit: 0.6714836359024048


In [71]:
target = nlp("Cats are beautiful animals.")

doc1 = nlp("Dogs are awesome.")
doc2 = nlp("Some gorgeous creatures are felines.")
doc3 = nlp("Dolphins are swimming mammals.")

# Print the similarity of the target to each candidate document, in order.
for candidate in (doc1, doc2, doc3):
    print(target.similarity(candidate))

0.8901765218466683
0.9115827883983011
0.7822955760597128


## Extension

In [74]:
import nltk
# Fetch the 'vader_lexicon' package; the call's return value is displayed as the cell output.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/quick/nltk_data...


True

We can set extensions on Doc, Token, or Span objects. Let's use a Doc extension here.

In [85]:
import spacy
from spacy.tokens import Doc
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# A single analyzer instance shared by every getter call.
sentiment_analyzer = SentimentIntensityAnalyzer()


def polarity_scores(doc):
    """Return the analyzer's polarity scores for the text of ``doc``."""
    raw_text = doc.text
    return sentiment_analyzer.polarity_scores(raw_text)


# Register the getter so the scores are available as ``doc._.polarity_scores``.
Doc.set_extension('polarity_scores', getter=polarity_scores, force=True)

nlp = spacy.load('en')
doc = nlp("The phone is super cool!!!")
print(doc._.polarity_scores)

{'neg': 0.0, 'neu': 0.298, 'pos': 0.702, 'compound': 0.795}


In [86]:
# List the names available on the top-level spacy module.
dir(spacy)

['Errors',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_align',
 '_ml',
 'about',
 'attrs',
 'blank',
 'cli',
 'cli_info',
 'compat',
 'displacy',
 'errors',
 'explain',
 'glossary',
 'gold',
 'info',
 'lang',
 'language',
 'lemmatizer',
 'lexeme',
 'load',
 'matcher',
 'morphology',
 'parts_of_speech',
 'pipeline',
 'prefer_gpu',
 'require_gpu',
 'scorer',
 'strings',
 'symbols',
 'syntax',
 'sys',
 'tokenizer',
 'tokens',
 'unicode_literals',
 'util',
 'vectors',
 'vocab',