In [1]:
import spacy

In [2]:
# Load the 'en_core_web_md' pipeline; the cells below read word vectors from it.
nlp = spacy.load('en_core_web_md')

In [3]:
# nlp(u'fox').vector

In [4]:
# nlp(u'The quick brown fox jumped').vector

In [5]:
# Shape of the vector for a single word.
nlp('fox').vector.shape

(300,)

In [6]:
# A whole phrase also exposes a single vector; compare its shape with the
# single-word case.
nlp('The quick brown fox jumped').vector.shape

(300,)

In [7]:
# Print the similarity of every ordered pair of tokens in a short phrase,
# one group of lines per left-hand token.
tokens = nlp('lion cat pet')

for left in tokens:
    for right in tokens:
        print(left.text, right.text, left.similarity(right))
    print('-----')

# Takeaway: "cat"/"pet" scores higher than "lion"/"pet".

lion lion 1.0
lion cat 0.5265437
lion pet 0.39923772
-----
cat lion 0.5265437
cat cat 1.0
cat pet 0.7505456
-----
pet lion 0.39923772
pet cat 0.7505456
pet pet 1.0
-----


Words used in similar contexts often have high similarity scores, even when their everyday English meanings are opposite. <br>
For example, "love" and "hate": you either love a movie or hate it, so both words appear in the same kinds of sentences.

In [8]:
# Same pairwise comparison as before, this time on words with related and
# opposite sentiment.
tokens = nlp('like love hate')

for left in tokens:
    for right in tokens:
        print(left.text, right.text, left.similarity(right))
    print('-----')

like like 1.0
like love 0.657904
like hate 0.6574652
-----
love like 0.657904
love love 1.0
love hate 0.6393099
-----
hate like 0.6574652
hate love 0.6393099
hate hate 1.0
-----


In [9]:
# Number of unique words in the vocabulary that have vectors (about 20,000)
len(nlp.vocab.vectors)

20000

In [10]:
# Shape of the vector table: (number of vectors, values per vector)
nlp.vocab.vectors.shape

(20000, 300)

In [11]:
# Two common words plus a made-up one ("nargle") to inspect how an unknown
# word is reported.
tokens = nlp("dog cat nargle")

In [14]:
# One row per token: text | has_vector | vector_norm | is_oov
# is_oov = "out of vocabulary": whether the word is missing from the
# 20,000+ words known to the spaCy library.
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov, sep='  |  ')

dog  |  True  |  7.0336733  |  False
cat  |  True  |  6.6808186  |  False
nargle  |  False  |  0.0  |  True


### Calculating Similar Words Based on Vector Distance

In [15]:
from scipy import spatial

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two 1-D vectors.

    ``spatial.distance.cosine`` computes the cosine *distance*
    (1 - similarity), so the result is subtracted from 1 to turn it back
    into a similarity.

    Args:
        vec1: First vector (array-like).
        vec2: Second vector (array-like) of the same length as ``vec1``.

    Returns:
        The cosine similarity of ``vec1`` and ``vec2``: 1 for vectors that
        point the same way, 0 for orthogonal vectors, -1 for opposite ones.
    """
    return 1 - spatial.distance.cosine(vec1, vec2)

In [16]:
# Look up the vocabulary vectors for the three words of the analogy.
king, man, woman = (
    nlp.vocab[term].vector for term in ('king', 'man', 'woman')
)

King - man + woman ---> NEW_VECTOR similar to Queen, Princess, Highness

In [17]:
# Vector arithmetic for the analogy: start from "king", subtract "man", add "woman".
new_vector = king - man + woman

In [18]:
# Score vocabulary entries against new_vector, collecting
# (lexeme, similarity) pairs.
# NOTE(review): this visits only what iterating nlp.vocab yields; confirm on
# the installed spaCy version that this covers every word that has a vector.
computed_similarities = []

for word in nlp.vocab:
    # Keep only entries that have a vector, are lowercase, and are
    # alphabetic (this drops entries such as numbers).
    if not (word.has_vector and word.is_lower and word.is_alpha):
        continue
    similarity = cosine_similarity(new_vector, word.vector)
    computed_similarities.append((word, similarity))

In [19]:
# Order the (lexeme, similarity) pairs from most to least similar.
computed_similarities = sorted(
    computed_similarities,
    key=lambda scored: scored[1],  # index 1 holds the similarity score
    reverse=True,
)

In [20]:
# Show the text of the 10 highest-scoring entries.
print([lexeme.text for lexeme, _score in computed_similarities[:10]])

['king', 'queen', 'commoner', 'highness', 'prince', 'sultan', 'maharajas', 'princes', 'kumbia', 'kings']
