In [1]:
#Import library
import spacy

In [2]:
# Load the English pipeline package "en_core_web_sm"; `nlp` is the parser object
# that the cells below call to turn raw text into a Doc.
nlp = spacy.load("en_core_web_sm")

## Tokenization

In [14]:
# Tokenization: parse one sentence, then print every token's text on its own line.
doc = nlp("Abdul kalam came into this world on 15th October 1931, at a time when India was under British occupation")
print(*(token.text for token in doc), sep="\n")

Abdul
kalam
came
into
this
world
on
15th
October
1931
,
at
a
time
when
India
was
under
British
occupation


In [15]:
# First token of the doc (index 0); as the cell's last expression it is shown as the cell output.
doc[0]

Abdul

In [16]:
# Last token of the doc. A negative index counts from the end, so there is no
# need to compute len(doc) - 1 by hand.
doc[-1]

occupation

## Part-of-Speech Tagging

In [25]:
# Parse a two-clause sentence; this `doc` is reused by the POS-tagging,
# dependency-parse and first named-entity cells that follow.
doc = nlp("Bob bought a car and Mary bought a cycle")

In [26]:
# Print one tab-separated row per token:
# text, character offset, lemma, coarse POS, fine-grained tag, dependency label,
# word shape, is-alphabetic flag, is-stop-word flag.
for token in doc:
    fields = (
        token.text,
        token.idx,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )
    # print() applies str() to every field, matching an explicit str()/join.
    print(*fields, sep="\t")

Bob	0	Bob	PROPN	NNP	nsubj	Xxx	True	False
bought	4	buy	VERB	VBD	ROOT	xxxx	True	False
a	11	a	DET	DT	det	x	True	True
car	13	car	NOUN	NN	dobj	xxx	True	False
and	17	and	CCONJ	CC	cc	xxx	True	True
Mary	21	Mary	PROPN	NNP	nsubj	Xxxx	True	False
bought	26	buy	VERB	VBD	conj	xxxx	True	False
a	33	a	DET	DT	det	x	True	True
cycle	35	cycle	NOUN	NN	dobj	xxxx	True	False


## Dependency Parse

In [27]:
# For each token, show its dependency label, its syntactic head (text and POS)
# and the list of tokens that depend directly on it.
for token in doc:
    head = token.head
    children = list(token.children)
    print(token.text, token.dep_, head.text, head.pos_, children)

Bob nsubj bought VERB []
bought ROOT bought VERB [Bob, car, and, bought]
a det car NOUN []
car dobj bought VERB [a]
and cc bought VERB []
Mary nsubj bought VERB []
bought conj bought VERB [Mary, cycle]
a det cycle NOUN []
cycle dobj bought VERB [a]


## Named Entity Recognition

In [28]:
# List every named entity in the doc: text, start/end character offsets, label.
for ent in doc.ents:
    entity_row = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*entity_row)

Bob 0 3 PERSON
Mary 21 25 PERSON


In [29]:
# Parse a sentence that mentions a person, two organisations and a date.
doc = nlp('Ali Hassan Kuban said that Apple Inc. will buy Google in May 2018.')

In [30]:
# List every named entity in the doc: text, start/end character offsets, label.
for ent in doc.ents:
    entity_row = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*entity_row)

Ali Hassan Kuban 0 16 PERSON
Apple Inc. 27 37 ORG
Google 47 53 ORG
May 2018 57 65 DATE


## Pattern Matching in spaCy

In [32]:
from spacy.matcher import Matcher

# Token pattern: a token whose lowercased text is "hi", followed by exactly one
# punctuation token, followed by a token whose lowercased text is "everyone".
matcher = Matcher(nlp.vocab)
pattern = [{'LOWER': 'hi'}, {'IS_PUNCT': True}, {'LOWER': 'everyone'}]
# Patterns are passed as a list. The older positional form
# `matcher.add(key, None, pattern)` was removed in spaCy v3.
matcher.add('Hieveryone', [pattern])


def print_matches(text):
    """Parse ``text``, run ``matcher`` over it and print one line per match.

    Each printed line contains the match id (hash), the string name of the
    rule, the start and end token indices, and the text of the matched span.

    Args:
        text: Raw string to parse with the module-level ``nlp`` pipeline.

    Returns:
        The parsed Doc, so the caller can keep inspecting it.
    """
    doc = nlp(text)
    for match_id, start, end in matcher(doc):
        string_id = nlp.vocab.strings[match_id]  # Hash -> rule name
        span = doc[start:end]  # The matched tokens
        print(match_id, string_id, start, end, span.text)
    return doc


# Both greetings contain a punctuation token between "Hi" and "everyone".
doc = print_matches('Hi, everyone! Hi... everyone!')
print("-" * 50)
# The second greeting has no punctuation token, so only the first one matches.
doc = print_matches('Hi, everyone! Hi everyone!')

3834098557791477199 Hieveryone 0 3 Hi, everyone
3834098557791477199 Hieveryone 4 7 Hi... everyone
--------------------------------------------------
3834098557791477199 Hieveryone 0 3 Hi, everyone


## Using the Dependency Parse Visualizer

In [33]:
import spacy

In [34]:
#Loading the visualizer
from spacy import displacy

In [35]:
# Load the English NLP pipeline. The same "en_core_web_sm" package is already
# loaded at the top of the notebook; this call rebinds `nlp` to a new pipeline object.
nlp = spacy.load("en_core_web_sm")

In [39]:
# Parse the sentence whose dependency tree is visualised in the next cell.
doc = nlp("Dick ran and Jane danced yesterday.")

In [40]:
# Visualization: render the dependency parse of `doc` inline in the notebook.
displacy.render(
    doc,
    style="dep",
    options={"distance": 120},
    jupyter=True,
)

## Similarities in Context

In [51]:
# Following the spaCy documentation example: parse the single word "labrador"
# and print, per token, whether it has a vector, the vector's norm, and
# whether it is out-of-vocabulary.
tokens = nlp('labrador')

for token in tokens:
    vector_info = (token.text, token.has_vector, token.vector_norm, token.is_oov)
    print(*vector_info)

labrador True 18.16932 False


In [52]:
# Test for context: compare the token "labrador" taken from three different
# sentences against the word "dog".
doc1 = nlp("The labrador barked.")
doc2 = nlp("The labrador swam.")
doc3 = nlp("the labrador people live in canada.")

dog = nlp("dog")

# Index 1 is the second token of each sentence, i.e. "labrador".
for count, doc in enumerate([doc1, doc2, doc3], start=1):
    lab = doc[1]
    print(f"{count}:", lab.similarity(dog))

1: 0.2024305733274931
2: 0.21782485927082987
3: 0.3072976816522165


  # This is added back by InteractiveShellApp.init_path()


In [53]:
# Compute pairwise document similarities: every doc is compared with every
# doc (itself included) and printed as "row col score".
docs = (
    nlp("Paris is the largest city in France."),
    nlp("Vilnius is the capital of Lithuania."),
    nlp("An emu is a large bird."),
)

for x, doc_x in enumerate(docs):
    for y, doc_y in enumerate(docs):
        print(x, y, doc_x.similarity(doc_y))

0 0 1.0
0 1 0.7710339939885976
0 2 0.6094215223832394
1 0 0.7710339939885976
1 1 1.0
1 2 0.4676224047022703
2 0 0.6094215223832394
2 1 0.4676224047022703
2 2 1.0


  import sys


In [54]:
# Varying the word order in short sentences and comparing every pair of them.
docs = [
    nlp(text)
    for text in ("dog bites man", "man bites dog", "man dog bites", "cat eats mouse")
]

for doc in docs:
    for other_doc in docs:
        print(f'"{doc.text}"', f'"{other_doc.text}"', doc.similarity(other_doc))

"dog bites man" "dog bites man" 1.0
"dog bites man" "man bites dog" 0.956404733273795
"dog bites man" "man dog bites" 0.8663885081525097
"dog bites man" "cat eats mouse" 0.6722633933080862
"man bites dog" "dog bites man" 0.956404733273795
"man bites dog" "man bites dog" 1.0
"man bites dog" "man dog bites" 0.862005529432172
"man bites dog" "cat eats mouse" 0.6647568936545546
"man dog bites" "dog bites man" 0.8663885081525097
"man dog bites" "man bites dog" 0.862005529432172
"man dog bites" "man dog bites" 1.0
"man dog bites" "cat eats mouse" 0.603555855034238
"cat eats mouse" "dog bites man" 0.6722633933080862
"cat eats mouse" "man bites dog" 0.6647568936545546
"cat eats mouse" "man dog bites" 0.603555855034238
"cat eats mouse" "cat eats mouse" 1.0


  
