#Word Embeddings - NLP

Converting words in vector format, such that the meaning of the word is maintained.

In [4]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m859.5 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
import spacy

nlp = spacy.load("en_core_web_lg")

In [8]:
doc = nlp("dog cat banana manav")

for token in doc:
  print(token.text, "Vector:", token.has_vector, "OOV:",token.is_oov)


dog Vector: True OOV: False
cat Vector: True OOV: False
banana Vector: True OOV: False
manav Vector: False OOV: True


In [10]:
doc[0].vector.shape

(300,)

In [38]:
base_token = nlp("Food")
base_token.vector.shape

(300,)

In [41]:
doc = nlp("bread sandwich burger car tiger human wheat")

for token in doc:
    print(f"{token.text} <-> {base_token.text} : ", token.similarity(base_token))

print("\n")
base = nlp("banana")

for token in doc:
    print(token.text ,"-", base.text,":", token.similarity(base_token))

bread <-> Food :  0.18048371057538962
sandwich <-> Food :  0.21470992715139706
burger <-> Food :  0.2838524955013364
car <-> Food :  -0.1186120285659198
tiger <-> Food :  -0.0009994601374418964
human <-> Food :  -0.02015259609131672
wheat <-> Food :  0.1894259751643385


bread - banana : 0.18048371057538962
sandwich - banana : 0.21470992715139706
burger - banana : 0.2838524955013364
car - banana : -0.1186120285659198
tiger - banana : -0.0009994601374418964
human - banana : -0.02015259609131672
wheat - banana : 0.1894259751643385


In [42]:
def print_similarity(base_word, words_to_compare):
  base_token = nlp(base_word)
  doc = nlp(words_to_compare)

  for token in doc:
    print(f"{token.text} <-> {base_token.text} : ", token.similarity(base_token))

In [43]:
print_similarity("iphone", "apple samsung iphone dog kitten")

apple <-> iphone :  0.4387907401919904
samsung <-> iphone :  0.670859081425417
iphone <-> iphone :  1.000000072144752
dog <-> iphone :  0.08211864228011527
kitten <-> iphone :  0.10222317834969896


In [51]:
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
queen = nlp.vocab['queen'].vector

result = king - man + woman

In [52]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([result], [queen])

array([[0.61780137]], dtype=float32)