`# Word Vectors in SpaCy`

In [1]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [2]:
%config Completer.use_jedi = False

In [3]:
nlp('lion').vector.shape

(300,)

In [4]:
nlp('My name is Rudra! I am 26 years young :D').vector.shape

(300,)

In [5]:
tokens = nlp('lion ball tree')

In [6]:
for t1 in tokens:
    for t2 in tokens:
        print(t1.text, t2.text, t1.similarity(t2))

lion lion 1.0
lion ball 0.19351771
lion tree 0.35865536
ball lion 0.19351771
ball ball 1.0
ball tree 0.26132903
tree lion 0.35865536
tree ball 0.26132903
tree tree 1.0


In [7]:
# How big is SpaCys Vocabulary
len(nlp.vocab.vectors)

684831

In [8]:
# Let's check the shape
nlp.vocab.vectors.shape

(684831, 300)

In [9]:
# Check if a vector exists
tokens = nlp(u'Hello Apple JohnCena')

In [10]:
for t in tokens:
    print(t.text, t.has_vector, t.vector_norm, t.is_oov)

Hello True 5.586428 False
Apple True 7.1346846 False
JohnCena False 0.0 True


`# Calculating Cosine Similarity`

In [11]:
from scipy import spatial

cosine_similarity = lambda vec1, vec2: 1 - spatial.distance.cosine(vec1, vec2)

In [12]:
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
king = nlp.vocab['king'].vector

new_vector = king - man + woman ## This new_vector should be similar to queen or princess

In [13]:
# finding the possible computed similarities
computed_similarity = []

## Finding all words in the vocabulary
for word in nlp.vocab:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:  # should not be a number
                similarity = cosine_similarity(new_vector, word.vector)
                computed_similarity.append((word, similarity)) ## List of tuples

In [14]:
## Let's sort the list of tuples now
computed_similarity = sorted(computed_similarity, key=lambda item:-item[1]) ## `lambda item:-item[1]` means the descending order of the item at index(1) which is `similarity`.
## This is a fancy way of sorting the tuples based on descending values of their similarity.

In [15]:
print([t[0].text for t in computed_similarity[:10]]) # grabbing first word in tuple and doing that for top 10 words.

['king', 'woman', 'she', 'lion', 'who', 'young', 'when', 'dare', 'was', 'not']
