spaCy ships with built-in word embeddings that you can use by loading an appropriate model. For example, for English you need to load either the medium (`en_core_web_md`) or the large (`en_core_web_lg`) model to access these word embeddings.

In [1]:
# !python -m spacy download en_core_web_lg

In [2]:
import spacy
# The large English model is loaded because this notebook relies on its word vectors.
# Install it once with: python -m spacy download en_core_web_lg

nlp = spacy.load("en_core_web_lg")

In [3]:
doc = nlp("dog cat banana afskfsd")

# For every token, report whether the model has a vector for it and whether
# the token is out-of-vocabulary (OOV).
for token in doc:
    print(f"{token.text} Vector: {token.has_vector} OOV: {token.is_oov}")

dog Vector: True OOV: False
cat Vector: True OOV: False
banana Vector: True OOV: False
afskfsd Vector: False OOV: True


In [4]:
# Display the full embedding vector of the first token in `doc`.
doc[0].vector

array([-4.0176e-01,  3.7057e-01,  2.1281e-02, -3.4125e-01,  4.9538e-02,
        2.9440e-01, -1.7376e-01, -2.7982e-01,  6.7622e-02,  2.1693e+00,
       -6.2691e-01,  2.9106e-01, -6.7270e-01,  2.3319e-01, -3.4264e-01,
        1.8311e-01,  5.0226e-01,  1.0689e+00,  1.4698e-01, -4.5230e-01,
       -4.1827e-01, -1.5967e-01,  2.6748e-01, -4.8867e-01,  3.6462e-01,
       -4.3403e-02, -2.4474e-01, -4.1752e-01,  8.9088e-02, -2.5552e-01,
       -5.5695e-01,  1.2243e-01, -8.3526e-02,  5.5095e-01,  3.6410e-01,
        1.5361e-01,  5.5738e-01, -9.0702e-01, -4.9098e-02,  3.8580e-01,
        3.8000e-01,  1.4425e-01, -2.7221e-01, -3.7016e-01, -1.2904e-01,
       -1.5085e-01, -3.8076e-01,  4.9583e-02,  1.2755e-01, -8.2788e-02,
        1.4339e-01,  3.2537e-01,  2.7226e-01,  4.3632e-01, -3.1769e-01,
        7.9405e-01,  2.6529e-01,  1.0135e-01, -3.3279e-01,  4.3117e-01,
        1.6687e-01,  1.0729e-01,  8.9418e-02,  2.8635e-01,  4.0117e-01,
       -3.9222e-01,  4.5217e-01,  1.3521e-01, -2.8878e-01, -2.28

In [5]:
# Dimensionality of a single token's vector.
doc[0].vector.shape

(300,)

In [6]:
# Reference text that other words are compared against in the following cell.
# NOTE(review): despite the name, nlp("bread") looks like it returns a whole
# parsed document (nlp(...) results are iterated token-by-token elsewhere in
# this notebook) rather than a single token -- confirm against the spaCy docs.
base_token = nlp("bread")
base_token.vector.shape

(300,)

In [7]:
doc = nlp("bread sandwich burger car tiger human wheat")

# Score every word of the sentence against the "bread" reference.
for token in doc:
    pair_label = f"{token.text} <-> {base_token.text}:"
    print(pair_label, token.similarity(base_token))

bread <-> bread: 1.0
sandwich <-> bread: 0.6874560014053445
burger <-> bread: 0.5440373883702087
car <-> bread: 0.1644114584391833
tiger <-> bread: 0.14492353269643002
human <-> bread: 0.21103659037655728
wheat <-> bread: 0.6572456428272563


In [8]:
def print_similarity(base_word, words_to_compare):
    """Print how similar each word in ``words_to_compare`` is to ``base_word``.

    Both arguments are strings that are run through the notebook-level ``nlp``
    pipeline. One line is printed per token of ``words_to_compare``, in the
    form ``<token text> <-> <base text>: <score>``. Nothing is returned.
    """
    reference = nlp(base_word)
    candidates = nlp(words_to_compare)

    for candidate in candidates:
        score = candidate.similarity(reference)
        print(f"{candidate.text} <-> {reference.text}:", score)


In [9]:
# Compare a handful of words against "iphone".
print_similarity(
    base_word="iphone",
    words_to_compare="apple samsung iphone dog kitten",
)

apple <-> iphone: 0.6339781147910419
samsung <-> iphone: 0.6678678014329177
iphone <-> iphone: 1.0
dog <-> iphone: 0.17431037640553934
kitten <-> iphone: 0.14685812907484028


In [10]:
# Look up the vocabulary vectors for the four words of the analogy.
king, man, woman, queen = (
    nlp.vocab[word].vector for word in ("king", "man", "woman", "queen")
)

# Vector arithmetic for the analogy; `result` is compared with `queen` afterwards.
result = king - man + woman

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity works on 2-D inputs, so present each vector as a single row.
cosine_similarity(result.reshape(1, -1), queen.reshape(1, -1))

array([[0.7880844]], dtype=float32)