In [5]:
#pip install spacy in Anaconda Command prompt
import spacy

In [7]:
#python -m spacy download en_core_web_sm in Anaconda Command prompt
import en_core_web_sm
nlp = en_core_web_sm.load()

In [8]:
para = nlp("Alibaba's Ant Financial-backed online food delivery and restaurant discovery platform Zomato has acquired Uber Eats, the food delivery business of ride-hailing giant Uber India for around Rs 2,485 crore ($350 million) in an all-stock deal.")
for words in para:
    print(words.text, words.pos_, words.dep_, words.lemma_, words.tag_, words.shape_, words.is_alpha, words.is_stop)
    #text is the original text of the lexeme
    #pos is the simple part-of-speech tag
    #dep is the syntactic dependency i.e. the relation between tokens
    #lemma is the base form of the word
    #tag is the detailed part-of-speech tag
    #shape is the word shape - capitalization, punctuation, digits
    #is_alpha is whether the token is an alpha character
    #is_stop is whether the token is part of a stop list i.e. the most common words of the language

Alibaba PROPN poss Alibaba NNP Xxxxx True False
's PART case 's POS 'x False True
Ant PROPN nmod Ant NNP Xxx True False
Financial PROPN npadvmod Financial NNP Xxxxx True False
- PUNCT punct - HYPH - False False
backed VERB amod back VBN xxxx True False
online ADJ amod online JJ xxxx True False
food NOUN compound food NN xxxx True False
delivery NOUN nsubj delivery NN xxxx True False
and CCONJ cc and CC xxx True True
restaurant NOUN compound restaurant NN xxxx True False
discovery NOUN compound discovery NN xxxx True False
platform NOUN compound platform NN xxxx True False
Zomato PROPN conj Zomato NNP Xxxxx True False
has AUX aux have VBZ xxx True True
acquired VERB ROOT acquire VBN xxxx True False
Uber PROPN compound Uber NNP Xxxx True False
Eats PROPN dobj Eats NNP Xxxx True False
, PUNCT punct , , , False False
the DET det the DT xxx True True
food NOUN compound food NN xxxx True False
delivery NOUN compound delivery NN xxxx True False
business NOUN appos business NN xxxx True False


In [9]:
for entities in para.ents:
    print(entities.text, entities.start_char, entities.end_char, entities.label_)
    #text is the original entity text
    #start_char is the index of start of entity
    #end_char is the index of end of entity
    #label is entity label i.e. type

Alibaba 0 7 ORG
Ant Financial 10 23 ORG
Zomato 86 92 GPE
Uber Eats 106 115 ORG
Uber India 166 176 ORG
2,485 191 196 CARDINAL
$350 million 204 216 MONEY


In [10]:
for words in para:
    print(words.text, words.has_vector, words.vector_norm, words.is_oov)
    #has_vector is whether the token has a vector representation 
    #vector_norm is the L2 norm of the token’s vector (the square root of the sum of the values squared) 
    #is_oov is Out-of-vocabulary

Alibaba True 21.996586 True
's True 21.219128 True
Ant True 21.123112 True
Financial True 19.393559 True
- True 23.283134 True
backed True 21.71244 True
online True 17.555346 True
food True 20.328575 True
delivery True 19.069447 True
and True 22.130964 True
restaurant True 18.034246 True
discovery True 18.531908 True
platform True 17.778807 True
Zomato True 22.53563 True
has True 22.474064 True
acquired True 21.299208 True
Uber True 20.857506 True
Eats True 21.80761 True
, True 21.101221 True
the True 20.774448 True
food True 19.287928 True
delivery True 18.139269 True
business True 18.707468 True
of True 24.315151 True
ride True 19.188444 True
- True 22.68037 True
hailing True 21.382889 True
giant True 19.886787 True
Uber True 21.241734 True
India True 21.995556 True
for True 20.265371 True
around True 20.628675 True
Rs True 17.769388 True
2,485 True 21.765858 True
crore True 21.0941 True
( True 20.21262 True
$ True 22.927814 True
350 True 23.755072 True
million True 23.420534 True
) 

In [11]:
print(para.vocab.strings["unique"])
#gives the id for a given string

5983206397430101660


In [18]:
#python -m spacy download en_core_web_md in Anaconda Command prompt
import en_core_web_md
nlp = en_core_web_md.load()
tokens = nlp("subject word intelligence lalalala")
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

subject True 5.62777 False
word True 5.8387117 False
intelligence True 7.082624 False
lalalala True 6.225437 False


In [19]:
tokens = nlp("subject word intelligence")
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))
        #similarity - returns a float value that a scalar similarity score. Higher is more similar.

subject subject 1.0
subject word 0.35317358
subject intelligence 0.2781774
word subject 0.35317358
word word 1.0
word intelligence 0.30542892
intelligence subject 0.2781774
intelligence word 0.30542892
intelligence intelligence 1.0


In [26]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Hello, world. Here are two sentences.")
print([t.text for t in doc])

['Hello', ',', 'world', '.', 'Here', 'are', 'two', 'sentences', '.']


In [42]:
#Training and updating neural network models
import random
nlp = spacy.load("en_core_web_sm")
train_data = [("Alibaba's Ant Financial-backed online food delivery and restaurant discovery platform Zomato has acquired Uber Eats, the food delivery business of ride-hailing giant Uber India for around Rs 2,485 crore ($350 million) in an all-stock deal.", {"entities": [(0, 4, "ORG")]})]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"] #get names of other pipes to disable them during training
print(other_pipes)
with nlp.disable_pipes(*other_pipes): #only train NER
    optimizer = nlp.begin_training() #reset and initialize the weights randomly – but only if we're training a new model
    for i in range(10):
        random.shuffle(train_data) 
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer) #update the model
nlp.to_disk("/model") # save the trained model
print(nlp)

['tagger', 'parser']
<spacy.lang.en.English object at 0x00000142BA61B888>


In [37]:
#serialization
#If you’ve been modifying the pipeline, vocabulary, vectors and entities, or made updates to the model, you’ll eventually want to save your progress
#This means you’ll have to translate its contents and structure into a format that can be saved, like a file or a byte string. This process is called serialization. spaCy comes with built-in serialization methods and supports the Pickle protocol.
from spacy.tokens import Doc
from spacy.vocab import Vocab
nlp = spacy.load("en_core_web_sm")
customer_feedback = open("customer_feedback_1.txt").read()
doc = nlp(customer_feedback)
doc.to_disk("/customer_feedback_1.bin")
#Can't execute below code because permission denied
#new_doc = Doc(Vocab()).from_disk("/customer_feedback_1.bin")

Hello
Hi


PermissionError: [Errno 13] Permission denied: '\\customer_feedback_1.bin'

In [38]:
#get word vectors and similarity
nlp = en_core_web_md.load()
doc = nlp("Apple and banana are similar. Pasta and hippo aren't.")
apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]
print("apple <-> banana", apple.similarity(banana))
print("pasta <-> hippo", pasta.similarity(hippo))
print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector)

apple <-> banana 0.5831844
pasta <-> hippo 0.120697394
True True True True
