In [3]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en import English

## Tokenizer

In [4]:
nlp = English()

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
my_doc = nlp(text)

# Create list of word tokens
token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', '\n', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']


## Sentencizer

In [5]:
# Create the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')

# Add the component to the pipeline
nlp.add_pipe(sbd)

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

["When learning data science, you shouldn't get discouraged!", "\nChallenges and setbacks aren't failures, they're just part of the journey.", "You've got this!"]


## Removing stopwords

In [6]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
print('First ten stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 326
First ten stop words: ['go', 'regarding', 'hereby', 'were', 'over', 'herself', 'please', 'whence', 'i', 'within', 'may', 'than', 'only', '’s', 'beyond', 'say', 'two', 'they', 'was', 'done']


In [7]:
from spacy.lang.en.stop_words import STOP_WORDS

# Implementation of stop words:
filtered_sent=[]

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# filtering stop words
for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:", filtered_sent)

Filtered Sentence: [learning, data, science, ,, discouraged, !, 
, Challenges, setbacks, failures, ,, journey, ., got, !]


## Lemmatization

In [8]:
# Implementing lemmatization
lem = nlp("run runs running runner")
# finding lemma for each word
for word in lem:
    print(word.text,word.lemma_)

run run
runs run
running run
runner runner


## POS tagging

In [9]:
# load en_core_web_md of English for vocabluary, syntax & entities
nlp = spacy.load('en_core_web_md')

#  "nlp" Objectis used to create documents with linguistic annotations.
docs = nlp(u"All is well that ends well.")

for word in docs:
    print(word.text,word.pos_)

All DET
is AUX
well ADV
that DET
ends VERB
well ADV
. PUNCT


## Entity detection

In [10]:
from spacy import displacy

nytimes= nlp(u"""New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases. At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday. The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.""")

entities=[(i, i.label_, i.label) for i in nytimes.ents]
entities

[(New York City, 'GPE', 384),
 (Tuesday, 'DATE', 391),
 (At least 285, 'CARDINAL', 397),
 (September, 'DATE', 391),
 (Brooklyn, 'GPE', 384),
 (Williamsburg, 'GPE', 384),
 (four, 'CARDINAL', 397),
 (Bill de Blasio, 'PERSON', 380),
 (Tuesday, 'DATE', 391),
 (Orthodox Jews, 'NORP', 381),
 (6 months old, 'DATE', 391),
 (up to $1,000, 'MONEY', 394)]

In [11]:
displacy.render(nytimes, style="ent", jupyter=True)

## Dependency parsing

In [12]:
docp = nlp ("In pursuit of a wall, President Trump ran into one.")

for chunk in docp.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

pursuit pursuit pobj In
a wall wall pobj of
President Trump Trump nsubj ran


In [13]:
displacy.render(docp, style="dep", jupyter=True)

## Word vectors

In [14]:
mango = nlp(u'mango')
print(mango.vector.shape)
print(mango.vector)

(300,)
[-2.8445e-01  5.5363e-01  4.9800e-01 -2.7769e-01  9.3481e-02  5.8978e-01
 -6.1267e-01  1.5415e-01  3.7752e-01  1.4348e-01 -2.0126e-01  2.6869e-01
 -7.2758e-01  2.4405e-01  3.5321e-01 -3.8314e-01  1.8920e-01  9.0860e-01
  2.0685e-01  6.7174e-02  4.9190e-01  4.8224e-01  1.2929e-01  4.9490e-01
 -9.3981e-02 -6.0443e-01  2.8314e-01 -7.4459e-02  5.9333e-02 -9.5484e-01
 -1.2755e-01  1.1871e-01  2.9725e-01 -1.8604e-01 -4.9672e-01 -1.4352e-01
 -1.2770e-02  1.0423e-01 -6.7861e-01  7.6421e-01  2.0364e-02 -3.7836e-02
  2.9399e-01 -4.1602e-01  3.1965e-01  9.8503e-01 -6.8950e-02 -4.0057e-01
 -1.3972e-01  3.2916e-01 -8.3725e-02  7.1081e-02  4.5407e-01 -6.0930e-02
  8.5099e-01 -7.3595e-01  3.5860e-01 -1.7554e-01 -1.5838e-01  3.1525e-02
 -1.4029e-01  9.3486e-02  4.2341e-01  2.2277e-01 -8.6968e-02 -4.4123e-01
  1.1326e-01  1.1435e-01 -8.7185e-01  5.2618e-01 -8.7317e-02  8.8284e-01
 -2.1643e-01  4.1260e-01 -1.9629e-01  2.4883e-01  8.2752e-02 -1.0800e-01
 -3.7362e-01 -2.0995e-01  7.9307e-02 -2.3805

## Similarity

In [15]:
tokens = nlp("dog cat banana")

for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

dog dog 1.0
dog cat 0.80168545
dog banana 0.24327643
cat dog 0.80168545
cat cat 1.0
cat banana 0.28154364
banana dog 0.24327643
banana cat 0.28154364
banana banana 1.0


In [16]:
doc1 = nlp('I love dogs')
doc2 = nlp('I hate dogs')
doc3 = nlp('I love cats')
doc4 = nlp('I hate cats')

print(doc1.similarity(doc2))
print(doc1.similarity(doc3))
print(doc1.similarity(doc4))

0.9408163917661322
0.9630646934125359
0.9065616726694914
